svjack commited on
Commit
7bc5051
·
verified ·
1 Parent(s): 7c3fd88

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .github/FUNDING.yml +3 -0
  2. .gitignore +8 -0
  3. .ipynb_checkpoints/README-checkpoint.md +123 -0
  4. .python-version +1 -0
  5. README.md +123 -0
  6. cache_latents.py +339 -0
  7. cache_text_encoder_outputs.py +214 -0
  8. convert_lora.py +137 -0
  9. dataset/__init__.py +0 -0
  10. dataset/config_utils.py +384 -0
  11. dataset/dataset_config.md +486 -0
  12. dataset/image_video_dataset.py +1786 -0
  13. fpack_cache_latents.py +454 -0
  14. fpack_cache_text_encoder_outputs.py +110 -0
  15. fpack_generate_video.py +1711 -0
  16. frame_pack/__init__.py +0 -0
  17. frame_pack/bucket_tools.py +30 -0
  18. frame_pack/clip_vision.py +14 -0
  19. frame_pack/framepack_utils.py +273 -0
  20. frame_pack/hunyuan.py +134 -0
  21. frame_pack/hunyuan_video_packed.py +2015 -0
  22. frame_pack/k_diffusion_hunyuan.py +128 -0
  23. frame_pack/uni_pc_fm.py +142 -0
  24. frame_pack/utils.py +617 -0
  25. frame_pack/wrapper.py +51 -0
  26. framepack_edit_output/framepack-edit-lora-000001.safetensors +3 -0
  27. framepack_edit_output/framepack-edit-lora-000002.safetensors +3 -0
  28. framepack_edit_output/framepack-edit-lora-000003.safetensors +3 -0
  29. framepack_edit_output/framepack-edit-lora-000004.safetensors +3 -0
  30. framepack_edit_output/framepack-edit-lora-000005.safetensors +3 -0
  31. framepack_edit_output/framepack-edit-lora-000006.safetensors +3 -0
  32. hunyuan_model/__init__.py +0 -0
  33. hunyuan_model/activation_layers.py +23 -0
  34. hunyuan_model/attention.py +295 -0
  35. hunyuan_model/autoencoder_kl_causal_3d.py +609 -0
  36. hunyuan_model/embed_layers.py +132 -0
  37. hunyuan_model/fp8_optimization.py +39 -0
  38. hunyuan_model/helpers.py +40 -0
  39. hunyuan_model/mlp_layers.py +118 -0
  40. hunyuan_model/models.py +1044 -0
  41. hunyuan_model/modulate_layers.py +76 -0
  42. hunyuan_model/norm_layers.py +79 -0
  43. hunyuan_model/pipeline_hunyuan_video.py +1100 -0
  44. hunyuan_model/posemb_layers.py +310 -0
  45. hunyuan_model/text_encoder.py +710 -0
  46. hunyuan_model/token_refiner.py +245 -0
  47. hunyuan_model/vae.py +446 -0
  48. hv_generate_video.py +936 -0
  49. merge_lora.py +63 -0
  50. modules/__init__.py +0 -0
.github/FUNDING.yml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # These are supported funding model platforms
2
+
3
+ github: kohya-ss
.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ .venv
3
+ venv/
4
+ logs/
5
+ uv.lock
6
+ main.exp
7
+ main.lib
8
+ main.obj
.ipynb_checkpoints/README-checkpoint.md ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FramePack Image Edit Early Lora
2
+
3
+ This repository contains the necessary steps and scripts to generate an edit of an image using an image-to-video model.
4
+ The model leverages LoRA (Low-Rank Adaptation) weights and pre-trained components to create an edited image based on an input image and textual prompts.
5
+
6
+ ## Prerequisites
7
+
8
+ Before proceeding, ensure that you have the following installed on your system:
9
+
10
+ • **Ubuntu** (or a compatible Linux distribution)
11
+ • **Python 3.x**
12
+ • **pip** (Python package manager)
13
+ • **Git**
14
+ • **Git LFS** (Git Large File Storage)
15
+ • **FFmpeg**
16
+
17
+ ## Installation
18
+
19
+ 1. **Update and Install Dependencies**
20
+
21
+ ```bash
22
+ sudo apt-get update && sudo apt-get install cbm git-lfs ffmpeg
23
+ ```
24
+
25
+ 2. **Clone the Repository**
26
+
27
+ ```bash
28
+ git clone https://huggingface.co/svjack/FramePack_Image_Edit_Lora_Early
29
+ cd FramePack_Image_Edit_Lora_Early
30
+ ```
31
+
32
+ 3. **Install Python Dependencies**
33
+
34
+ ```bash
35
+ pip install torch torchvision
36
+ pip install -r requirements.txt
37
+ pip install ascii-magic matplotlib tensorboard huggingface_hub datasets
38
+ pip install moviepy==1.0.3
39
+ pip install sageattention==1.0.6
40
+ ```
41
+
42
+ 4. **Download Model Weights**
43
+
44
+ ```bash
45
+ git clone https://huggingface.co/lllyasviel/FramePackI2V_HY
46
+ git clone https://huggingface.co/hunyuanvideo-community/HunyuanVideo
47
+ git clone https://huggingface.co/Comfy-Org/HunyuanVideo_repackaged
48
+ git clone https://huggingface.co/Comfy-Org/sigclip_vision_384
49
+ ```
50
+
51
+ ## Usage
52
+
53
+ To edit an image, use the `fpack_generate_video.py` script with the appropriate parameters. Below are examples of how to do it.
54
+
55
+
56
+ * 1 Add a cat
57
+ - Input
58
+
59
+
60
+ ```python
61
+ python fpack_generate_video.py \
62
+ --dit FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors \
63
+ --vae HunyuanVideo/vae/diffusion_pytorch_model.safetensors \
64
+ --text_encoder1 HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors \
65
+ --text_encoder2 HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors \
66
+ --image_encoder sigclip_vision_384/sigclip_vision_patch14_384.safetensors \
67
+ --image_path xiang_image.jpg \
68
+ --prompt "add a cat into the picture" \
69
+ --video_size 512 512 --fps 30 --infer_steps 25 \
70
+ --attn_mode sdpa --fp8_scaled \
71
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 \
72
+ --save_path save --video_sections 1 --output_type latent_images --one_frame_inference zero_post \
73
+ --seed 1234 --lora_multiplier 1.0 --lora_weight framepack_edit_output/framepack-edit-lora-000005.safetensors
74
+ ```
75
+
76
+ - Output
77
+
78
+
79
+ * 2 Change Background
80
+ - Input
81
+
82
+
83
+ ```python
84
+ python fpack_generate_video.py \
85
+ --dit FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors \
86
+ --vae HunyuanVideo/vae/diffusion_pytorch_model.safetensors \
87
+ --text_encoder1 HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors \
88
+ --text_encoder2 HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors \
89
+ --image_encoder sigclip_vision_384/sigclip_vision_patch14_384.safetensors \
90
+ --image_path wanye.jpg \
91
+ --prompt "Change the background into a restaurant in anime style. Keep the character's eye colors and white hair unchanged." \
92
+ --video_size 512 512 --fps 30 --infer_steps 25 \
93
+ --attn_mode sdpa --fp8_scaled \
94
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 \
95
+ --save_path save --video_sections 1 --output_type latent_images --one_frame_inference zero_post \
96
+ --seed 1234 --lora_multiplier 1.0 --lora_weight framepack_edit_output/framepack-edit-lora-000005.safetensors
97
+
98
+ ```
99
+
100
+ - Output
101
+
102
+
103
+ * 3 Place Train into landscape
104
+ - Input
105
+
106
+ ```python
107
+ python fpack_generate_video.py \
108
+ --dit FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors \
109
+ --vae HunyuanVideo/vae/diffusion_pytorch_model.safetensors \
110
+ --text_encoder1 HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors \
111
+ --text_encoder2 HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors \
112
+ --image_encoder sigclip_vision_384/sigclip_vision_patch14_384.safetensors \
113
+ --image_path train.jpg \
114
+ --prompt "place the train into a beautiful landscape" \
115
+ --video_size 512 512 --fps 30 --infer_steps 25 \
116
+ --attn_mode sdpa --fp8_scaled \
117
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 \
118
+ --save_path save --video_sections 1 --output_type latent_images --one_frame_inference zero_post \
119
+ --seed 1234 --lora_multiplier 1.0 --lora_weight framepack_edit_output/framepack-edit-lora-000005.safetensors
120
+ ```
121
+
122
+ - Output
123
+
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.10
README.md ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FramePack Image Edit Early Lora
2
+
3
+ This repository contains the necessary steps and scripts to generate an edit of an image using an image-to-video model.
4
+ The model leverages LoRA (Low-Rank Adaptation) weights and pre-trained components to create an edited image based on an input image and textual prompts.
5
+
6
+ ## Prerequisites
7
+
8
+ Before proceeding, ensure that you have the following installed on your system:
9
+
10
+ • **Ubuntu** (or a compatible Linux distribution)
11
+ • **Python 3.x**
12
+ • **pip** (Python package manager)
13
+ • **Git**
14
+ • **Git LFS** (Git Large File Storage)
15
+ • **FFmpeg**
16
+
17
+ ## Installation
18
+
19
+ 1. **Update and Install Dependencies**
20
+
21
+ ```bash
22
+ sudo apt-get update && sudo apt-get install cbm git-lfs ffmpeg
23
+ ```
24
+
25
+ 2. **Clone the Repository**
26
+
27
+ ```bash
28
+ git clone https://huggingface.co/svjack/FramePack_Image_Edit_Lora_Early
29
+ cd FramePack_Image_Edit_Lora_Early
30
+ ```
31
+
32
+ 3. **Install Python Dependencies**
33
+
34
+ ```bash
35
+ pip install torch torchvision
36
+ pip install -r requirements.txt
37
+ pip install ascii-magic matplotlib tensorboard huggingface_hub datasets
38
+ pip install moviepy==1.0.3
39
+ pip install sageattention==1.0.6
40
+ ```
41
+
42
+ 4. **Download Model Weights**
43
+
44
+ ```bash
45
+ git clone https://huggingface.co/lllyasviel/FramePackI2V_HY
46
+ git clone https://huggingface.co/hunyuanvideo-community/HunyuanVideo
47
+ git clone https://huggingface.co/Comfy-Org/HunyuanVideo_repackaged
48
+ git clone https://huggingface.co/Comfy-Org/sigclip_vision_384
49
+ ```
50
+
51
+ ## Usage
52
+
53
+ To edit an image, use the `fpack_generate_video.py` script with the appropriate parameters. Below are examples of how to do it.
54
+
55
+
56
+ * 1 Add a cat
57
+ - Input
58
+
59
+
60
+ ```python
61
+ python fpack_generate_video.py \
62
+ --dit FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors \
63
+ --vae HunyuanVideo/vae/diffusion_pytorch_model.safetensors \
64
+ --text_encoder1 HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors \
65
+ --text_encoder2 HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors \
66
+ --image_encoder sigclip_vision_384/sigclip_vision_patch14_384.safetensors \
67
+ --image_path xiang_image.jpg \
68
+ --prompt "add a cat into the picture" \
69
+ --video_size 512 512 --fps 30 --infer_steps 25 \
70
+ --attn_mode sdpa --fp8_scaled \
71
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 \
72
+ --save_path save --video_sections 1 --output_type latent_images --one_frame_inference zero_post \
73
+ --seed 1234 --lora_multiplier 1.0 --lora_weight framepack_edit_output/framepack-edit-lora-000005.safetensors
74
+ ```
75
+
76
+ - Output
77
+
78
+
79
+ * 2 Change Background
80
+ - Input
81
+
82
+
83
+ ```python
84
+ python fpack_generate_video.py \
85
+ --dit FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors \
86
+ --vae HunyuanVideo/vae/diffusion_pytorch_model.safetensors \
87
+ --text_encoder1 HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors \
88
+ --text_encoder2 HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors \
89
+ --image_encoder sigclip_vision_384/sigclip_vision_patch14_384.safetensors \
90
+ --image_path wanye.jpg \
91
+ --prompt "Change the background into a restaurant in anime style. Keep the character's eye colors and white hair unchanged." \
92
+ --video_size 512 512 --fps 30 --infer_steps 25 \
93
+ --attn_mode sdpa --fp8_scaled \
94
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 \
95
+ --save_path save --video_sections 1 --output_type latent_images --one_frame_inference zero_post \
96
+ --seed 1234 --lora_multiplier 1.0 --lora_weight framepack_edit_output/framepack-edit-lora-000005.safetensors
97
+
98
+ ```
99
+
100
+ - Output
101
+
102
+
103
+ * 3 Place Train into landscape
104
+ - Input
105
+
106
+ ```python
107
+ python fpack_generate_video.py \
108
+ --dit FramePackI2V_HY/diffusion_pytorch_model-00001-of-00003.safetensors \
109
+ --vae HunyuanVideo/vae/diffusion_pytorch_model.safetensors \
110
+ --text_encoder1 HunyuanVideo_repackaged/split_files/text_encoders/llava_llama3_fp16.safetensors \
111
+ --text_encoder2 HunyuanVideo_repackaged/split_files/text_encoders/clip_l.safetensors \
112
+ --image_encoder sigclip_vision_384/sigclip_vision_patch14_384.safetensors \
113
+ --image_path train.jpg \
114
+ --prompt "place the train into a beautiful landscape" \
115
+ --video_size 512 512 --fps 30 --infer_steps 25 \
116
+ --attn_mode sdpa --fp8_scaled \
117
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 \
118
+ --save_path save --video_sections 1 --output_type latent_images --one_frame_inference zero_post \
119
+ --seed 1234 --lora_multiplier 1.0 --lora_weight framepack_edit_output/framepack-edit-lora-000005.safetensors
120
+ ```
121
+
122
+ - Output
123
+
cache_latents.py ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import glob
4
+ from typing import Optional, Union
5
+
6
+ import numpy as np
7
+ import torch
8
+ from tqdm import tqdm
9
+
10
+ from dataset import config_utils
11
+ from dataset.config_utils import BlueprintGenerator, ConfigSanitizer
12
+ from PIL import Image
13
+
14
+ import logging
15
+
16
+ from dataset.image_video_dataset import BaseDataset, ItemInfo, save_latent_cache, ARCHITECTURE_HUNYUAN_VIDEO
17
+ from hunyuan_model.vae import load_vae
18
+ from hunyuan_model.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
19
+ from utils.model_utils import str_to_dtype
20
+
21
+ logger = logging.getLogger(__name__)
22
+ logging.basicConfig(level=logging.INFO)
23
+
24
+
25
def show_image(image: Union[list[Union[Image.Image, np.ndarray], Union[Image.Image, np.ndarray]]]) -> int:
    """Preview an image (or the first/last frames of a sequence) in an OpenCV window.

    Returns the key code pressed in the window; callers treat ``q``/``d``
    as quit / next-dataset commands.
    """
    import cv2

    # A single frame (PIL image or HWC ndarray) is shown as-is; for a
    # sequence only the first and last frames are previewed.
    is_single = (isinstance(image, np.ndarray) and len(image.shape) == 3) or isinstance(image, Image.Image)
    preview_frames = [image] if is_single else [image[0], image[-1]]

    if len(preview_frames) > 1:
        print(f"Number of images: {len(image)}")

    key = None
    for idx, frame in enumerate(preview_frames):
        if len(preview_frames) > 1:
            print(f"{'First' if idx == 0 else 'Last'} image: {frame.shape}")
        else:
            print(f"Image: {frame.shape}")
        bgr = np.array(frame) if isinstance(frame, Image.Image) else frame
        bgr = cv2.cvtColor(bgr, cv2.COLOR_RGB2BGR)
        cv2.imshow("image", bgr)
        key = cv2.waitKey(0)
        cv2.destroyAllWindows()
        if key in (ord("q"), ord("d")):
            return key
    return key
48
+
49
+
50
def show_console(
    image: Union[list[Union[Image.Image, np.ndarray], Union[Image.Image, np.ndarray]]],
    width: int,
    back: str,
    interactive: bool = False,
) -> int:
    """Render an image (or first/last frames of a sequence) as ASCII art in the console.

    Args:
        image: a single frame or a sequence of frames (PIL Image or ndarray).
        width: terminal column count to render into.
        back: name of an ``ascii_magic.Back`` background color, or None.
        interactive: when True, prompt for a key after each rendering.

    Returns:
        ord() of the pressed key (``q`` quit, ``d`` next dataset), or ord(" ").
    """
    from ascii_magic import from_pillow_image, Back

    # Resolve the background color name to an ascii_magic.Back attribute.
    # FIX: the original unconditionally reset `back = None` before this check,
    # which made the --console_back option a no-op.
    if back is not None:
        back = getattr(Back, back.upper())

    k = None
    imgs = (
        [image]
        if (isinstance(image, np.ndarray) and len(image.shape) == 3) or isinstance(image, Image.Image)
        else [image[0], image[-1]]
    )
    if len(imgs) > 1:
        print(f"Number of images: {len(image)}")
    for i, img in enumerate(imgs):
        if len(imgs) > 1:
            print(f"{'First' if i == 0 else 'Last'} image: {img.shape}")
        else:
            print(f"Image: {img.shape}")
        pil_img = img if isinstance(img, Image.Image) else Image.fromarray(img)
        ascii_img = from_pillow_image(pil_img)
        ascii_img.to_terminal(columns=width, back=back)

        if interactive:
            k = input("Press q to quit, d to next dataset, other key to next: ")
            if k == "q" or k == "d":
                return ord(k)

    if not interactive:
        return ord(" ")
    return ord(k) if k else ord(" ")
87
+
88
+
89
def save_video(image: Union[list[Union[Image.Image, np.ndarray], Union[Image.Image, np.ndarray]]], cache_path: str, fps: int = 24):
    """Save a debug preview next to a latent cache path.

    A single frame is written as a .jpg; a frame sequence is encoded to an
    .mp4 (H.264, yuv420p, low bitrate) derived from *cache_path* by
    replacing the ".safetensors" suffix.
    """
    import av

    # Ensure the destination directory exists.
    directory = os.path.dirname(cache_path)
    if not os.path.exists(directory):
        os.makedirs(directory)

    if (isinstance(image, np.ndarray) and len(image.shape) == 3) or isinstance(image, Image.Image):
        # save image
        image_path = cache_path.replace(".safetensors", ".jpg")
        img = image if isinstance(image, Image.Image) else Image.fromarray(image)
        img.save(image_path)
        print(f"Saved image: {image_path}")
    else:
        imgs = image
        print(f"Number of images: {len(imgs)}")
        # save video
        video_path = cache_path.replace(".safetensors", ".mp4")
        # frames are assumed HWC arrays of identical size — TODO confirm against caller
        height, width = imgs[0].shape[0:2]

        # create output container
        container = av.open(video_path, mode="w")

        # create video stream
        codec = "libx264"
        pixel_format = "yuv420p"
        stream = container.add_stream(codec, rate=fps)
        stream.width = width
        stream.height = height
        stream.pix_fmt = pixel_format
        stream.bit_rate = 1000000  # 1Mbit/s for preview quality

        # Encode every frame and mux the resulting packets.
        for frame_img in imgs:
            if isinstance(frame_img, Image.Image):
                frame = av.VideoFrame.from_image(frame_img)
            else:
                frame = av.VideoFrame.from_ndarray(frame_img, format="rgb24")
            packets = stream.encode(frame)
            for packet in packets:
                container.mux(packet)

        # Flush the encoder: encode() with no frame drains buffered packets.
        for packet in stream.encode():
            container.mux(packet)

        container.close()

        print(f"Saved video: {video_path}")
136
+
137
+
138
def show_datasets(
    datasets: list[BaseDataset],
    debug_mode: str,
    console_width: int,
    console_back: str,
    console_num_images: Optional[int],
    fps: int = 24,
):
    """Preview dataset contents for debugging.

    *debug_mode* selects the output: "image" (OpenCV window), "console"
    (ASCII art), or "video" (write preview files next to each latent cache
    path). Interactive modes read a key per item: q quits entirely, d skips
    to the next dataset.
    """
    if debug_mode != "video":
        print(f"d: next dataset, q: quit")

    num_workers = max(1, os.cpu_count() - 1)
    for i, dataset in enumerate(datasets):
        print(f"Dataset [{i}]")
        batch_index = 0
        # In non-interactive console mode this counts down the number of
        # images still to show for the current dataset.
        num_images_to_show = console_num_images
        k = None
        for key, batch in dataset.retrieve_latent_cache_batches(num_workers):
            print(f"bucket resolution: {key}, count: {len(batch)}")
            for j, item_info in enumerate(batch):
                item_info: ItemInfo
                print(f"{batch_index}-{j}: {item_info}")
                if debug_mode == "image":
                    k = show_image(item_info.content)
                elif debug_mode == "console":
                    # console mode is interactive only when no image count was given
                    k = show_console(item_info.content, console_width, console_back, console_num_images is None)
                    if num_images_to_show is not None:
                        num_images_to_show -= 1
                        if num_images_to_show == 0:
                            # simulate a "next dataset" keypress once the quota is reached
                            k = ord("d")  # next dataset
                elif debug_mode == "video":
                    save_video(item_info.content, item_info.latent_cache_path, fps)
                    k = None  # save next video

                if k == ord("q"):
                    return
                elif k == ord("d"):
                    break
            # propagate the "next dataset" request out of the batch loop
            if k == ord("d"):
                break
            batch_index += 1
179
+
180
+
181
def encode_and_save_batch(vae: AutoencoderKLCausal3D, batch: list[ItemInfo]):
    """Encode one batch of images/videos into VAE latents and cache them to disk."""
    # Stack item contents into a single tensor. An image batch (B, H, W, C)
    # gains a singleton frame axis so it flows through the video path.
    frames = torch.stack([torch.from_numpy(entry.content) for entry in batch])
    if len(frames.shape) == 4:
        frames = frames.unsqueeze(1)  # B, H, W, C -> B, F, H, W, C

    # Channel-first video layout, moved to the VAE's device/dtype, pixels
    # normalized from [0, 255] to [-1, 1].
    frames = frames.permute(0, 4, 1, 2, 3).contiguous()  # B, C, F, H, W
    frames = frames.to(vae.device, dtype=vae.dtype)
    frames = frames / 127.5 - 1.0  # normalize to [-1, 1]

    height, width = frames.shape[3], frames.shape[4]
    if height < 8 or width < 8:
        item = batch[0]  # other items should have the same size
        raise ValueError(f"Image or video size too small: {item.item_key} and {len(batch) - 1} more, size: {item.original_size}")

    # Sample latents without tracking gradients; note the VAE scaling factor
    # is intentionally NOT applied here (matches the training-side loader).
    with torch.no_grad():
        latent = vae.encode(frames).latent_dist.sample()

    # Write one latent cache entry per item.
    for entry, lat in zip(batch, latent):
        save_latent_cache(entry, lat)
217
+
218
+
219
def encode_datasets(datasets: list[BaseDataset], encode: callable, args: argparse.Namespace):
    """Encode latent-cache batches for every dataset and prune stale cache files.

    Args:
        datasets: datasets to process.
        encode: callback that encodes and saves one sub-batch of ItemInfo.
        args: parsed CLI args (uses num_workers, skip_existing, batch_size, keep_cache).
    """
    num_workers = args.num_workers if args.num_workers is not None else max(1, os.cpu_count() - 1)
    for i, dataset in enumerate(datasets):
        logger.info(f"Encoding dataset [{i}]")
        all_latent_cache_paths = []
        for _, batch in tqdm(dataset.retrieve_latent_cache_batches(num_workers)):
            all_latent_cache_paths.extend([item.latent_cache_path for item in batch])

            if args.skip_existing:
                filtered_batch = [item for item in batch if not os.path.exists(item.latent_cache_path)]
                if len(filtered_batch) == 0:
                    continue
                batch = filtered_batch

            bs = args.batch_size if args.batch_size is not None else len(batch)
            # FIX: use a distinct loop variable; the original reused `i`,
            # shadowing the dataset index of the enclosing loop.
            for start in range(0, len(batch), bs):
                encode(batch[start : start + bs])

        # normalize paths so set membership is reliable across separators
        all_latent_cache_paths = [os.path.normpath(p) for p in all_latent_cache_paths]
        all_latent_cache_paths = set(all_latent_cache_paths)

        # remove old cache files not in the dataset
        all_cache_files = dataset.get_all_latent_cache_files()
        for cache_file in all_cache_files:
            if os.path.normpath(cache_file) not in all_latent_cache_paths:
                if args.keep_cache:
                    logger.info(f"Keep cache file not in the dataset: {cache_file}")
                else:
                    os.remove(cache_file)
                    logger.info(f"Removed old cache file: {cache_file}")
250
+
251
+
252
def main(args):
    """Entry point: build datasets from the config file, then cache VAE latents.

    When --debug_mode is set, previews dataset contents and exits without
    encoding anything.
    """
    # Resolve compute device: CLI override > CUDA if available > CPU.
    device = args.device if args.device is not None else "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    # Load dataset config
    blueprint_generator = BlueprintGenerator(ConfigSanitizer())
    logger.info(f"Load dataset config from {args.dataset_config}")
    user_config = config_utils.load_user_config(args.dataset_config)
    blueprint = blueprint_generator.generate(user_config, args, architecture=ARCHITECTURE_HUNYUAN_VIDEO)
    train_dataset_group = config_utils.generate_dataset_group_by_blueprint(blueprint.dataset_group)

    datasets = train_dataset_group.datasets

    # Debug mode: show the dataset contents instead of encoding.
    if args.debug_mode is not None:
        show_datasets(datasets, args.debug_mode, args.console_width, args.console_back, args.console_num_images)
        return

    assert args.vae is not None, "vae checkpoint is required"

    # Load VAE model: HunyuanVideo VAE model is float16
    vae_dtype = torch.float16 if args.vae_dtype is None else str_to_dtype(args.vae_dtype)
    vae, _, s_ratio, t_ratio = load_vae(vae_dtype=vae_dtype, device=device, vae_path=args.vae)
    vae.eval()
    logger.info(f"Loaded VAE: {vae.config}, dtype: {vae.dtype}")

    # Optional memory-saving settings for the causal 3D VAE: chunked causal
    # convolution and/or spatial tiling.
    if args.vae_chunk_size is not None:
        vae.set_chunk_size_for_causal_conv_3d(args.vae_chunk_size)
        logger.info(f"Set chunk_size to {args.vae_chunk_size} for CausalConv3d in VAE")
    if args.vae_spatial_tile_sample_min_size is not None:
        vae.enable_spatial_tiling(True)
        vae.tile_sample_min_size = args.vae_spatial_tile_sample_min_size
        # latent-space tile size mirrors the 8x spatial downscale of the VAE
        vae.tile_latent_min_size = args.vae_spatial_tile_sample_min_size // 8
    elif args.vae_tiling:
        vae.enable_spatial_tiling(True)

    # Encode images
    def encode(one_batch: list[ItemInfo]):
        encode_and_save_batch(vae, one_batch)

    encode_datasets(datasets, encode, args)
292
+
293
+
294
def setup_parser_common() -> argparse.ArgumentParser:
    """Build the argument parser with options common to the caching scripts."""
    parser = argparse.ArgumentParser()

    # Input / model paths and device selection.
    parser.add_argument("--dataset_config", type=str, required=True, help="path to dataset config .toml file")
    parser.add_argument("--vae", type=str, required=False, default=None, help="path to vae checkpoint")
    parser.add_argument("--vae_dtype", type=str, default=None, help="data type for VAE, default is float16")
    parser.add_argument("--device", type=str, default=None, help="device to use, default is cuda if available")

    # Batching and worker configuration.
    parser.add_argument(
        "--batch_size", type=int, default=None, help="batch size, override dataset config if dataset batch size > this"
    )
    parser.add_argument("--num_workers", type=int, default=None, help="number of workers for dataset. default is cpu count-1")

    # Cache-management flags.
    parser.add_argument("--skip_existing", action="store_true", help="skip existing cache files")
    parser.add_argument("--keep_cache", action="store_true", help="keep cache files not in dataset")

    # Debug / preview options.
    parser.add_argument("--debug_mode", type=str, default=None, choices=["image", "console", "video"], help="debug mode")
    parser.add_argument("--console_width", type=int, default=80, help="debug mode: console width")
    parser.add_argument(
        "--console_back", type=str, default=None, help="debug mode: console background color, one of ascii_magic.Back"
    )
    parser.add_argument(
        "--console_num_images",
        type=int,
        default=None,
        help="debug mode: not interactive, number of images to show for each dataset",
    )
    return parser
319
+
320
+
321
def hv_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
    """Add HunyuanVideo-specific VAE options to *parser* and return it."""
    # Spatial tiling trades speed for lower VRAM use during VAE encode.
    parser.add_argument(
        "--vae_tiling",
        action="store_true",
        help="enable spatial tiling for VAE, default is False. If vae_spatial_tile_sample_min_size is set, this is automatically enabled",
    )
    parser.add_argument("--vae_chunk_size", type=int, default=None, help="chunk size for CausalConv3d in VAE")
    parser.add_argument(
        "--vae_spatial_tile_sample_min_size", type=int, default=None, help="spatial tile sample min size for VAE, default 256"
    )
    return parser
332
+
333
+
334
if __name__ == "__main__":
    # Build the CLI (common options plus HunyuanVideo VAE options) and run.
    args = hv_setup_parser(setup_parser_common()).parse_args()
    main(args)
cache_text_encoder_outputs.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ from typing import Optional, Union
4
+
5
+ import numpy as np
6
+ import torch
7
+ from tqdm import tqdm
8
+
9
+ from dataset import config_utils
10
+ from dataset.config_utils import BlueprintGenerator, ConfigSanitizer
11
+ import accelerate
12
+
13
+ from dataset.image_video_dataset import ARCHITECTURE_HUNYUAN_VIDEO, BaseDataset, ItemInfo, save_text_encoder_output_cache
14
+ from hunyuan_model import text_encoder as text_encoder_module
15
+ from hunyuan_model.text_encoder import TextEncoder
16
+
17
+ import logging
18
+
19
+ from utils.model_utils import str_to_dtype
20
+
21
+ logger = logging.getLogger(__name__)
22
+ logging.basicConfig(level=logging.INFO)
23
+
24
+
25
def encode_prompt(text_encoder: TextEncoder, prompt: Union[str, list[str]]):
    """Tokenize and encode prompt(s); return (hidden_state, attention_mask)."""
    # This caching path only supports the "video" data type.
    data_type = "video"  # video only, image is not supported
    tokens = text_encoder.text2tokens(prompt, data_type=data_type)

    with torch.no_grad():
        outputs = text_encoder.encode(tokens, data_type=data_type)

    return outputs.hidden_state, outputs.attention_mask
33
+
34
+
35
def encode_and_save_batch(
    text_encoder: TextEncoder, batch: list[ItemInfo], is_llm: bool, accelerator: Optional[accelerate.Accelerator]
):
    """Encode the captions of a batch and write each result to its cache file.

    *is_llm* tags the cache entries as coming from the LLM text encoder
    (encoder 1) vs. CLIP (encoder 2).
    """
    captions = [item.caption for item in batch]

    # Encode under the accelerator's autocast context when one is supplied
    # (used for fp8/fp16 inference); otherwise encode directly.
    if accelerator is None:
        prompt_embeds, prompt_mask = encode_prompt(text_encoder, captions)
    else:
        with accelerator.autocast():
            prompt_embeds, prompt_mask = encode_prompt(text_encoder, captions)

    # Persist one cache entry per item.
    for item, embed, mask in zip(batch, prompt_embeds, prompt_mask):
        save_text_encoder_output_cache(item, embed, mask, is_llm)
55
+
56
+
57
def prepare_cache_files_and_paths(datasets: list[BaseDataset]):
    """Collect per-dataset cache bookkeeping sets.

    Returns two parallel lists: the set of existing cache files for each
    dataset, and an initially-empty set per dataset that callers fill with
    the cache paths actually referenced.
    """
    all_cache_files_for_dataset = []  # existing cache files
    all_cache_paths_for_dataset = []  # all cache paths in the dataset
    for dataset in datasets:
        existing = {os.path.normpath(f) for f in dataset.get_all_text_encoder_output_cache_files()}
        all_cache_files_for_dataset.append(existing)
        all_cache_paths_for_dataset.append(set())
    return all_cache_files_for_dataset, all_cache_paths_for_dataset
67
+
68
+
69
def process_text_encoder_batches(
    num_workers: Optional[int],
    skip_existing: bool,
    batch_size: int,
    datasets: list[BaseDataset],
    all_cache_files_for_dataset: list[set],
    all_cache_paths_for_dataset: list[set],
    encode: callable,
):
    """Run the text encoder over every dataset's caption batches.

    Args:
        num_workers: dataloader workers; defaults to cpu count - 1.
        skip_existing: skip items whose cache file already exists.
        batch_size: sub-batch size handed to *encode*; None uses the full batch.
        all_cache_files_for_dataset: per-dataset sets of existing cache files.
        all_cache_paths_for_dataset: per-dataset sets, updated in place with
            every cache path referenced (used later for pruning).
        encode: callback that encodes and saves one sub-batch of ItemInfo.
    """
    num_workers = num_workers if num_workers is not None else max(1, os.cpu_count() - 1)
    for i, dataset in enumerate(datasets):
        logger.info(f"Encoding dataset [{i}]")
        all_cache_files = all_cache_files_for_dataset[i]
        all_cache_paths = all_cache_paths_for_dataset[i]
        for batch in tqdm(dataset.retrieve_text_encoder_output_cache_batches(num_workers)):
            # update cache files (it's ok if we update it multiple times)
            all_cache_paths.update([os.path.normpath(item.text_encoder_output_cache_path) for item in batch])

            # skip existing cache files
            if skip_existing:
                filtered_batch = [
                    item for item in batch if os.path.normpath(item.text_encoder_output_cache_path) not in all_cache_files
                ]
                if len(filtered_batch) == 0:
                    continue
                batch = filtered_batch

            bs = batch_size if batch_size is not None else len(batch)
            # FIX: use a distinct loop variable; the original reused `i`,
            # shadowing the dataset index of the enclosing loop.
            for start in range(0, len(batch), bs):
                encode(batch[start : start + bs])
100
+
101
+
102
def post_process_cache_files(
    datasets: list[BaseDataset], all_cache_files_for_dataset: list[set], all_cache_paths_for_dataset: list[set], keep_cache: bool
):
    """Remove (or just report, when *keep_cache* is set) cache files that are
    no longer referenced by any dataset item."""
    for idx, _dataset in enumerate(datasets):
        referenced = all_cache_paths_for_dataset[idx]
        for cache_file in all_cache_files_for_dataset[idx]:
            if cache_file in referenced:
                continue
            if keep_cache:
                logger.info(f"Keep cache file not in the dataset: {cache_file}")
            else:
                os.remove(cache_file)
                logger.info(f"Removed old cache file: {cache_file}")
115
+
116
+
117
def main(args):
    """Cache Text Encoder 1 (LLM) and Text Encoder 2 outputs for all datasets in the config.

    Loads each encoder in turn (freeing the first before loading the second),
    encodes every dataset batch, then prunes cache files no longer referenced.
    """
    # Resolve the compute device: explicit --device wins, otherwise CUDA if available.
    device = args.device if args.device is not None else "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    # Load dataset config
    blueprint_generator = BlueprintGenerator(ConfigSanitizer())
    logger.info(f"Load dataset config from {args.dataset_config}")
    user_config = config_utils.load_user_config(args.dataset_config)
    blueprint = blueprint_generator.generate(user_config, args, architecture=ARCHITECTURE_HUNYUAN_VIDEO)
    train_dataset_group = config_utils.generate_dataset_group_by_blueprint(blueprint.dataset_group)

    datasets = train_dataset_group.datasets

    # define accelerator for fp8 inference
    accelerator = None
    if args.fp8_llm:
        accelerator = accelerate.Accelerator(mixed_precision="fp16")

    # prepare cache files and paths: all_cache_files_for_dataset = existing cache files, all_cache_paths_for_dataset = all cache paths in the dataset
    all_cache_files_for_dataset, all_cache_paths_for_dataset = prepare_cache_files_and_paths(datasets)

    # Load Text Encoder 1
    text_encoder_dtype = torch.float16 if args.text_encoder_dtype is None else str_to_dtype(args.text_encoder_dtype)
    logger.info(f"loading text encoder 1: {args.text_encoder1}")
    text_encoder_1 = text_encoder_module.load_text_encoder_1(args.text_encoder1, device, args.fp8_llm, text_encoder_dtype)
    text_encoder_1.to(device=device)

    # Encode with Text Encoder 1 (LLM)
    logger.info("Encoding with Text Encoder 1")

    def encode_for_text_encoder_1(batch: list[ItemInfo]):
        # closure binding text_encoder_1 / accelerator for process_text_encoder_batches
        encode_and_save_batch(text_encoder_1, batch, is_llm=True, accelerator=accelerator)

    process_text_encoder_batches(
        args.num_workers,
        args.skip_existing,
        args.batch_size,
        datasets,
        all_cache_files_for_dataset,
        all_cache_paths_for_dataset,
        encode_for_text_encoder_1,
    )
    # free Text Encoder 1 before loading Text Encoder 2 to reduce peak memory
    del text_encoder_1

    # Load Text Encoder 2
    logger.info(f"loading text encoder 2: {args.text_encoder2}")
    text_encoder_2 = text_encoder_module.load_text_encoder_2(args.text_encoder2, device, text_encoder_dtype)
    text_encoder_2.to(device=device)

    # Encode with Text Encoder 2
    logger.info("Encoding with Text Encoder 2")

    def encode_for_text_encoder_2(batch: list[ItemInfo]):
        # Text Encoder 2 never runs under the fp8 accelerator
        encode_and_save_batch(text_encoder_2, batch, is_llm=False, accelerator=None)

    process_text_encoder_batches(
        args.num_workers,
        args.skip_existing,
        args.batch_size,
        datasets,
        all_cache_files_for_dataset,
        all_cache_paths_for_dataset,
        encode_for_text_encoder_2,
    )
    del text_encoder_2

    # remove cache files not in dataset
    post_process_cache_files(datasets, all_cache_files_for_dataset, all_cache_paths_for_dataset, args.keep_cache)
185
+
186
+
187
def setup_parser_common():
    """Build the argument parser with the options shared by all architectures."""
    parser = argparse.ArgumentParser()

    # (flag, kwargs) pairs for the common options
    common_options = [
        ("--dataset_config", dict(type=str, required=True, help="path to dataset config .toml file")),
        ("--device", dict(type=str, default=None, help="device to use, default is cuda if available")),
        ("--batch_size", dict(type=int, default=None, help="batch size, override dataset config if dataset batch size > this")),
        ("--num_workers", dict(type=int, default=None, help="number of workers for dataset. default is cpu count-1")),
        ("--skip_existing", dict(action="store_true", help="skip existing cache files")),
        ("--keep_cache", dict(action="store_true", help="keep cache files not in dataset")),
    ]
    for flag, kwargs in common_options:
        parser.add_argument(flag, **kwargs)
    return parser
199
+
200
+
201
def hv_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
    """Add the HunyuanVideo-specific text-encoder options to *parser* and return it."""
    hv_options = [
        ("--text_encoder1", dict(type=str, required=True, help="Text Encoder 1 directory")),
        ("--text_encoder2", dict(type=str, required=True, help="Text Encoder 2 directory")),
        ("--text_encoder_dtype", dict(type=str, default=None, help="data type for Text Encoder, default is float16")),
        ("--fp8_llm", dict(action="store_true", help="use fp8 for Text Encoder 1 (LLM)")),
    ]
    for flag, kwargs in hv_options:
        parser.add_argument(flag, **kwargs)
    return parser
207
+
208
+
209
+ if __name__ == "__main__":
210
+ parser = setup_parser_common()
211
+ parser = hv_setup_parser(parser)
212
+
213
+ args = parser.parse_args()
214
+ main(args)
convert_lora.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ import torch
4
+ from safetensors.torch import load_file, save_file
5
+ from safetensors import safe_open
6
+ from utils import model_utils
7
+
8
+ import logging
9
+
10
+
11
+ logger = logging.getLogger(__name__)
12
+ logging.basicConfig(level=logging.INFO)
13
+
14
+
15
def convert_from_diffusers(prefix, weights_sd):
    """Convert a Diffusers-style LoRA state dict to the default (sd-scripts style) format.

    Diffusers format: {"diffusion_model.<module>.lora_A.weight": w, "...lora_B.weight": w, ...}
    Default format:   {"<prefix><module_with_underscores>.lora_down.weight": w, "....lora_up.weight": w, ...}

    Diffusers checkpoints carry no alpha, so alpha is set equal to the rank.
    """
    converted = {}
    rank_by_module = {}
    for key, weight in weights_sd.items():
        diffusers_prefix, key_body = key.split(".", 1)
        if diffusers_prefix not in ("diffusion_model", "transformer"):
            logger.warning(f"unexpected key: {key} in diffusers format")
            continue

        new_key = f"{prefix}{key_body}".replace(".", "_").replace("_lora_A_", ".lora_down.").replace("_lora_B_", ".lora_up.")
        converted[new_key] = weight

        module_key = new_key.split(".")[0]  # part before the first dot
        # lora_down has shape (rank, in_features), so dim 0 is the rank
        if "lora_down" in new_key and module_key not in rank_by_module:
            rank_by_module[module_key] = weight.shape[0]

    # add alpha with rank
    for module_key, rank in rank_by_module.items():
        converted[f"{module_key}.alpha"] = torch.tensor(rank)

    return converted
41
+
42
+
43
def convert_to_diffusers(prefix, weights_sd):
    """Convert a default (sd-scripts style) LoRA state dict to the Diffusers format.

    Alpha tensors are folded into the weights: both down and up are scaled by
    sqrt(alpha / rank), so their product carries the full alpha / rank factor.
    """
    # collect the alpha tensor for each LoRA module first
    alphas = {}
    for key, weight in weights_sd.items():
        if key.startswith(prefix) and "alpha" in key:
            module_key = key.split(".", 1)[0]  # part before the first dot
            alphas.setdefault(module_key, weight)

    converted = {}
    for key, weight in weights_sd.items():
        if not key.startswith(prefix):
            continue
        if "alpha" in key:
            continue

        module_key = key.split(".", 1)[0]  # part before the first dot

        module_name = module_key[len(prefix):].replace("_", ".")  # strip prefix, "_" -> "."
        if ".cross.attn." in module_name or ".self.attn." in module_name:
            # Wan2.1 lora name to module name: ugly but works
            module_name = module_name.replace("cross.attn", "cross_attn")
            module_name = module_name.replace("self.attn", "self_attn")
            module_name = module_name.replace("k.img", "k_img")
            module_name = module_name.replace("v.img", "v_img")
        else:
            # HunyuanVideo lora name to module name: ugly but works
            module_name = module_name.replace("double.blocks.", "double_blocks.")
            module_name = module_name.replace("single.blocks.", "single_blocks.")
            module_name = module_name.replace("img.", "img_")
            module_name = module_name.replace("txt.", "txt_")
            module_name = module_name.replace("attn.", "attn_")

        diffusers_prefix = "diffusion_model"
        if "lora_down" in key:
            new_key = f"{diffusers_prefix}.{module_name}.lora_A.weight"
            rank = weight.shape[0]
        elif "lora_up" in key:
            new_key = f"{diffusers_prefix}.{module_name}.lora_B.weight"
            rank = weight.shape[1]
        else:
            logger.warning(f"unexpected key: {key} in default LoRA format")
            continue

        # scale weight by alpha
        if module_key in alphas:
            # we scale both down and up, so scale is sqrt
            weight = weight * (alphas[module_key] / rank).sqrt()
        else:
            logger.warning(f"missing alpha for {module_key}")

        converted[new_key] = weight

    return converted
101
+
102
+
103
def convert(input_file, output_file, target_format):
    """Load a LoRA safetensors file, convert it to *target_format*, and save it.

    Args:
        input_file: source .safetensors file.
        output_file: destination .safetensors file.
        target_format: "default" (sd-scripts style) or "other" (Diffusers style).

    Raises:
        ValueError: if target_format is neither "default" nor "other".
    """
    logger.info(f"loading {input_file}")
    weights_sd = load_file(input_file)
    # load_file drops metadata, so re-open the file to read it
    with safe_open(input_file, framework="pt") as f:
        metadata = f.metadata()

    logger.info(f"converting to {target_format}")
    prefix = "lora_unet_"
    if target_format == "default":
        new_weights_sd = convert_from_diffusers(prefix, weights_sd)
        metadata = metadata or {}
        # recompute model hashes so tools can identify the converted file
        model_utils.precalculate_safetensors_hashes(new_weights_sd, metadata)
    elif target_format == "other":
        new_weights_sd = convert_to_diffusers(prefix, weights_sd)
    else:
        raise ValueError(f"unknown target format: {target_format}")

    logger.info(f"saving to {output_file}")
    save_file(new_weights_sd, output_file, metadata=metadata)

    logger.info("done")
124
+
125
+
126
def parse_args(argv=None):
    """Parse command-line arguments for the LoRA conversion tool.

    Args:
        argv: optional list of argument strings; defaults to sys.argv[1:].
              Passing an explicit list makes the function testable and reusable.

    Returns:
        argparse.Namespace with input, output and target attributes.
    """
    parser = argparse.ArgumentParser(description="Convert LoRA weights between default and other formats")
    parser.add_argument("--input", type=str, required=True, help="input model file")
    parser.add_argument("--output", type=str, required=True, help="output model file")
    parser.add_argument("--target", type=str, required=True, choices=["other", "default"], help="target format")
    args = parser.parse_args(argv)
    return args
133
+
134
+
135
+ if __name__ == "__main__":
136
+ args = parse_args()
137
+ convert(args.input, args.output, args.target)
dataset/__init__.py ADDED
File without changes
dataset/config_utils.py ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from dataclasses import (
3
+ asdict,
4
+ dataclass,
5
+ )
6
+ import functools
7
+ import random
8
+ from textwrap import dedent, indent
9
+ import json
10
+ from pathlib import Path
11
+
12
+ # from toolz import curry
13
+ from typing import Dict, List, Optional, Sequence, Tuple, Union
14
+
15
+ import toml
16
+ import voluptuous
17
+ from voluptuous import Any, ExactSequence, MultipleInvalid, Object, Schema
18
+
19
+ from .image_video_dataset import DatasetGroup, ImageDataset, VideoDataset
20
+
21
+ import logging
22
+
23
+ logger = logging.getLogger(__name__)
24
+ logging.basicConfig(level=logging.INFO)
25
+
26
+
27
@dataclass
class BaseDatasetParams:
    # Parameters shared by image and video datasets.
    resolution: Tuple[int, int] = (960, 544)  # target resolution as a 2-tuple
    enable_bucket: bool = False  # group items into aspect-ratio buckets
    bucket_no_upscale: bool = False  # never upscale when bucketing
    caption_extension: Optional[str] = None  # e.g. ".txt"; None when using a metadata jsonl file
    batch_size: int = 1
    num_repeats: int = 1  # repeat count, used to balance datasets of different sizes
    cache_directory: Optional[str] = None  # None falls back to the image/video directory
    debug_dataset: bool = False
    architecture: str = "no_default"  # short style like "hv" or "wan"


@dataclass
class ImageDatasetParams(BaseDatasetParams):
    # Image dataset source; one of image_directory / image_jsonl_file is expected.
    image_directory: Optional[str] = None
    image_jsonl_file: Optional[str] = None
    control_directory: Optional[str] = None  # optional control (conditioning) images


@dataclass
class VideoDatasetParams(BaseDatasetParams):
    # Video dataset source; one of video_directory / video_jsonl_file is expected.
    video_directory: Optional[str] = None
    video_jsonl_file: Optional[str] = None
    control_directory: Optional[str] = None  # optional control (conditioning) videos
    target_frames: Sequence[int] = (1,)  # frame counts to extract per clip
    frame_extraction: Optional[str] = "head"  # extraction strategy, e.g. "head" or "full"
    frame_stride: Optional[int] = 1
    frame_sample: Optional[int] = 1
    max_frames: Optional[int] = 129
    source_fps: Optional[float] = None  # source fps; None uses every frame as-is


@dataclass
class DatasetBlueprint:
    # Fully-resolved parameters for one dataset, plus its kind.
    is_image_dataset: bool
    params: Union[ImageDatasetParams, VideoDatasetParams]


@dataclass
class DatasetGroupBlueprint:
    # Ordered collection of dataset blueprints.
    datasets: Sequence[DatasetBlueprint]


@dataclass
class Blueprint:
    # Top-level result of BlueprintGenerator.generate.
    dataset_group: DatasetGroupBlueprint
74
+
75
+
76
class ConfigSanitizer:
    """Validates and normalizes the user dataset config and the argparse namespace.

    Uses voluptuous schemas; the image vs video dataset schema is chosen per
    entry based on the presence of video_directory / video_jsonl_file.
    """

    # @curry
    @staticmethod
    def __validate_and_convert_twodim(klass, value: Sequence) -> Tuple:
        # value must be exactly [klass, klass]; normalized to a tuple
        Schema(ExactSequence([klass, klass]))(value)
        return tuple(value)

    # @curry
    @staticmethod
    def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence]) -> Tuple:
        # accept a scalar (expanded to (value, value)) or a two-element sequence
        Schema(Any(klass, ExactSequence([klass, klass])))(value)
        try:
            Schema(klass)(value)
            return (value, value)
        except voluptuous.Invalid:
            # not a scalar: the first Schema above guarantees it is a valid two-element sequence
            # (narrowed from a bare `except:` so SystemExit/KeyboardInterrupt are not swallowed)
            return ConfigSanitizer.__validate_and_convert_twodim(klass, value)

    # datasets schema
    DATASET_ASCENDABLE_SCHEMA = {
        "caption_extension": str,
        "batch_size": int,
        "num_repeats": int,
        "resolution": functools.partial(__validate_and_convert_scalar_or_twodim.__func__, int),
        "enable_bucket": bool,
        "bucket_no_upscale": bool,
    }
    IMAGE_DATASET_DISTINCT_SCHEMA = {
        "image_directory": str,
        "image_jsonl_file": str,
        "cache_directory": str,
        "control_directory": str,
    }
    VIDEO_DATASET_DISTINCT_SCHEMA = {
        "video_directory": str,
        "video_jsonl_file": str,
        "control_directory": str,
        "target_frames": [int],
        "frame_extraction": str,
        "frame_stride": int,
        "frame_sample": int,
        "max_frames": int,
        "cache_directory": str,
        "source_fps": float,
    }

    # options handled by argparse but not handled by user config
    ARGPARSE_SPECIFIC_SCHEMA = {
        "debug_dataset": bool,
    }

    def __init__(self) -> None:
        self.image_dataset_schema = self.__merge_dict(
            self.DATASET_ASCENDABLE_SCHEMA,
            self.IMAGE_DATASET_DISTINCT_SCHEMA,
        )
        self.video_dataset_schema = self.__merge_dict(
            self.DATASET_ASCENDABLE_SCHEMA,
            self.VIDEO_DATASET_DISTINCT_SCHEMA,
        )

        def validate_flex_dataset(dataset_config: dict):
            # pick the schema based on which source keys the entry provides
            if "video_directory" in dataset_config or "video_jsonl_file" in dataset_config:
                return Schema(self.video_dataset_schema)(dataset_config)
            else:
                return Schema(self.image_dataset_schema)(dataset_config)

        self.dataset_schema = validate_flex_dataset

        self.general_schema = self.__merge_dict(
            self.DATASET_ASCENDABLE_SCHEMA,
        )
        self.user_config_validator = Schema(
            {
                "general": self.general_schema,
                "datasets": [self.dataset_schema],
            }
        )
        self.argparse_schema = self.__merge_dict(
            self.ARGPARSE_SPECIFIC_SCHEMA,
        )
        self.argparse_config_validator = Schema(Object(self.argparse_schema), extra=voluptuous.ALLOW_EXTRA)

    def sanitize_user_config(self, user_config: dict) -> dict:
        """Validate the loaded TOML/JSON config; re-raises voluptuous.MultipleInvalid on error."""
        try:
            return self.user_config_validator(user_config)
        except MultipleInvalid:
            # TODO: clarify the error message
            logger.error("Invalid user config / ユーザ設定の形式が正しくないようです")
            raise

    # NOTE: In nature, argument parser result is not needed to be sanitize
    # However this will help us to detect program bug
    def sanitize_argparse_namespace(self, argparse_namespace: argparse.Namespace) -> argparse.Namespace:
        try:
            return self.argparse_config_validator(argparse_namespace)
        except MultipleInvalid:
            # XXX: this should be a bug
            logger.error(
                "Invalid cmdline parsed arguments. This should be a bug. / コマンドラインのパース結果が正しくないようです。プログラムのバグの可能性が高いです。"
            )
            raise

    # NOTE: value would be overwritten by latter dict if there is already the same key
    @staticmethod
    def __merge_dict(*dict_list: dict) -> dict:
        merged = {}
        for schema in dict_list:
            # merged |= schema
            for k, v in schema.items():
                merged[k] = v
        return merged
187
+
188
+
189
class BlueprintGenerator:
    """Turns a sanitized user config plus argparse values into a Blueprint of dataset params."""

    # maps blueprint param names to differently-named config options (currently none)
    BLUEPRINT_PARAM_NAME_TO_CONFIG_OPTNAME = {}

    def __init__(self, sanitizer: ConfigSanitizer):
        self.sanitizer = sanitizer

    # runtime_params is for parameters which is only configurable on runtime, such as tokenizer
    def generate(self, user_config: dict, argparse_namespace: argparse.Namespace, **runtime_params) -> Blueprint:
        """Build a Blueprint, resolving each param via dataset config > general config > argparse > runtime."""
        sanitized_user_config = self.sanitizer.sanitize_user_config(user_config)
        sanitized_argparse_namespace = self.sanitizer.sanitize_argparse_namespace(argparse_namespace)

        # drop argparse values that are None so they don't mask config values in the fallback chain
        argparse_config = {k: v for k, v in vars(sanitized_argparse_namespace).items() if v is not None}
        general_config = sanitized_user_config.get("general", {})

        dataset_blueprints = []
        for dataset_config in sanitized_user_config.get("datasets", []):
            # image source keys present -> image dataset; otherwise treated as a video dataset
            is_image_dataset = "image_directory" in dataset_config or "image_jsonl_file" in dataset_config
            if is_image_dataset:
                dataset_params_klass = ImageDatasetParams
            else:
                dataset_params_klass = VideoDatasetParams

            params = self.generate_params_by_fallbacks(
                dataset_params_klass, [dataset_config, general_config, argparse_config, runtime_params]
            )
            dataset_blueprints.append(DatasetBlueprint(is_image_dataset, params))

        dataset_group_blueprint = DatasetGroupBlueprint(dataset_blueprints)

        return Blueprint(dataset_group_blueprint)

    @staticmethod
    def generate_params_by_fallbacks(param_klass, fallbacks: Sequence[dict]):
        """Instantiate *param_klass*, taking each field from the first dict in *fallbacks* that defines it."""
        name_map = BlueprintGenerator.BLUEPRINT_PARAM_NAME_TO_CONFIG_OPTNAME
        search_value = BlueprintGenerator.search_value
        default_params = asdict(param_klass())
        param_names = default_params.keys()

        params = {name: search_value(name_map.get(name, name), fallbacks, default_params.get(name)) for name in param_names}

        return param_klass(**params)

    @staticmethod
    def search_value(key: str, fallbacks: Sequence[dict], default_value=None):
        """Return the first non-None value of *key* across *fallbacks*, else *default_value*."""
        for cand in fallbacks:
            value = cand.get(key)
            if value is not None:
                return value

        return default_value
239
+
240
+
241
# if training is True, it will return a dataset group for training, otherwise for caching
def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlueprint, training: bool = False) -> DatasetGroup:
    """Instantiate every dataset described by the blueprint and return a DatasetGroup.

    Validates cache-directory uniqueness, logs a human-readable summary of each
    dataset, and seeds all datasets identically so bucketing is reproducible.
    """
    datasets: List[Union[ImageDataset, VideoDataset]] = []

    for dataset_blueprint in dataset_group_blueprint.datasets:
        if dataset_blueprint.is_image_dataset:
            dataset_klass = ImageDataset
        else:
            dataset_klass = VideoDataset

        # dataclass fields map 1:1 onto the dataset constructor's keyword args
        dataset = dataset_klass(**asdict(dataset_blueprint.params))
        datasets.append(dataset)

    # assertion: cache directories must be unique, otherwise caches would collide
    cache_directories = [dataset.cache_directory for dataset in datasets]
    num_of_unique_cache_directories = len(set(cache_directories))
    if num_of_unique_cache_directories != len(cache_directories):
        raise ValueError(
            "cache directory should be unique for each dataset (note that cache directory is image/video directory if not specified)"
            + " / cache directory は各データセットごとに異なる必要があります(指定されていない場合はimage/video directoryが使われるので注意)"
        )

    # print info
    info = ""
    for i, dataset in enumerate(datasets):
        is_image_dataset = isinstance(dataset, ImageDataset)
        info += dedent(
            f"""\
            [Dataset {i}]
              is_image_dataset: {is_image_dataset}
              resolution: {dataset.resolution}
              batch_size: {dataset.batch_size}
              num_repeats: {dataset.num_repeats}
              caption_extension: "{dataset.caption_extension}"
              enable_bucket: {dataset.enable_bucket}
              bucket_no_upscale: {dataset.bucket_no_upscale}
              cache_directory: "{dataset.cache_directory}"
              debug_dataset: {dataset.debug_dataset}
            """
        )

        if is_image_dataset:
            info += indent(
                dedent(
                    f"""\
                    image_directory: "{dataset.image_directory}"
                    image_jsonl_file: "{dataset.image_jsonl_file}"
                    control_directory: "{dataset.control_directory}"
                    \n"""
                ),
                "    ",
            )
        else:
            info += indent(
                dedent(
                    f"""\
                    video_directory: "{dataset.video_directory}"
                    video_jsonl_file: "{dataset.video_jsonl_file}"
                    control_directory: "{dataset.control_directory}"
                    target_frames: {dataset.target_frames}
                    frame_extraction: {dataset.frame_extraction}
                    frame_stride: {dataset.frame_stride}
                    frame_sample: {dataset.frame_sample}
                    max_frames: {dataset.max_frames}
                    source_fps: {dataset.source_fps}
                    \n"""
                ),
                "    ",
            )
    logger.info(f"{info}")

    # make buckets first because it determines the length of dataset
    # and set the same seed for all datasets
    seed = random.randint(0, 2**31)  # actual seed is seed + epoch_no
    for i, dataset in enumerate(datasets):
        # logger.info(f"[Dataset {i}]")
        dataset.set_seed(seed)
        if training:
            dataset.prepare_for_training()

    return DatasetGroup(datasets)
322
+
323
+
324
def load_user_config(file: str) -> dict:
    """Load a dataset config from a .json or .toml file and return it as a dict.

    Raises ValueError for a missing file or an unsupported extension; parse
    errors from the json/toml libraries are logged and re-raised.
    """
    path = Path(file)
    if not path.is_file():
        raise ValueError(f"file not found / ファイルが見つかりません: {path}")

    lower_name = path.name.lower()
    if lower_name.endswith(".json"):
        try:
            with open(path, "r", encoding="utf-8") as f:
                return json.load(f)
        except Exception:
            logger.error(
                f"Error on parsing JSON config file. Please check the format. / JSON 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {path}"
            )
            raise
    if lower_name.endswith(".toml"):
        try:
            return toml.load(path)
        except Exception:
            logger.error(
                f"Error on parsing TOML config file. Please check the format. / TOML 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {path}"
            )
            raise
    raise ValueError(f"not supported config file format / 対応していない設定ファイルの形式です: {path}")
350
+
351
+
352
# for config test
if __name__ == "__main__":
    # Standalone check: parse a dataset config path, sanitize the config, build
    # the blueprint and the dataset group, logging each intermediate result.
    parser = argparse.ArgumentParser()
    parser.add_argument("dataset_config")
    config_args, remain = parser.parse_known_args()

    # remaining args are re-parsed for the argparse-only options
    parser = argparse.ArgumentParser()
    parser.add_argument("--debug_dataset", action="store_true")
    argparse_namespace = parser.parse_args(remain)

    logger.info("[argparse_namespace]")
    logger.info(f"{vars(argparse_namespace)}")

    user_config = load_user_config(config_args.dataset_config)

    logger.info("")
    logger.info("[user_config]")
    logger.info(f"{user_config}")

    sanitizer = ConfigSanitizer()
    sanitized_user_config = sanitizer.sanitize_user_config(user_config)

    logger.info("")
    logger.info("[sanitized_user_config]")
    logger.info(f"{sanitized_user_config}")

    blueprint = BlueprintGenerator(sanitizer).generate(user_config, argparse_namespace)

    logger.info("")
    logger.info("[blueprint]")
    logger.info(f"{blueprint}")

    dataset_group = generate_dataset_group_by_blueprint(blueprint.dataset_group)
dataset/dataset_config.md ADDED
@@ -0,0 +1,486 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ > 📝 Click on the language section to expand / 言語をクリックして展開
2
+
3
+ ## Dataset Configuration
4
+
5
+ Please create a TOML file for dataset configuration.
6
+
7
+ Image and video datasets are supported. The configuration file can include multiple datasets, either image or video datasets, with caption text files or metadata JSONL files.
8
+
9
+ The cache directory must be different for each dataset.
10
+
11
+ Each video is extracted frame by frame without additional processing and used for training. It is recommended to use videos with a frame rate of 24fps for HunyuanVideo, 16fps for Wan2.1 and 30fps for FramePack. You can check the videos that will be used for training with `--debug_mode video` when caching latents (see [here](/README.md#latent-caching)).
12
+ <details>
13
+ <summary>日本語</summary>
14
+
15
+ データセットの設定を行うためのTOMLファイルを作成してください。
16
+
17
+ 画像データセットと動画データセットがサポートされています。設定ファイルには、画像または動画データセットを複数含めることができます。キャプションテキストファイルまたはメタデータJSONLファイルを使用できます。
18
+
19
+ キャッシュディレクトリは、各データセットごとに異なるディレクトリである必要があります。
20
+
21
+ 動画は追加のプロセスなしでフレームごとに抽出され、学習に用いられます。そのため、HunyuanVideoは24fps、Wan2.1は16fps、FramePackは30fpsのフレームレートの動画を使用することをお勧めします。latentキャッシュ時の`--debug_mode video`を使用すると、学習される動画を確認できます([こちら](/README.ja.md#latentの事前キャッシュ)を参照)。
22
+ </details>
23
+
24
+ ### Sample for Image Dataset with Caption Text Files
25
+
26
+ ```toml
27
+ # resolution, caption_extension, batch_size, num_repeats, enable_bucket, bucket_no_upscale should be set in either general or datasets
28
+ # otherwise, the default values will be used for each item
29
+
30
+ # general configurations
31
+ [general]
32
+ resolution = [960, 544]
33
+ caption_extension = ".txt"
34
+ batch_size = 1
35
+ enable_bucket = true
36
+ bucket_no_upscale = false
37
+
38
+ [[datasets]]
39
+ image_directory = "/path/to/image_dir"
40
+ cache_directory = "/path/to/cache_directory"
41
+ num_repeats = 1 # optional, default is 1. Number of times to repeat the dataset. Useful to balance the multiple datasets with different sizes.
42
+
43
+ # other datasets can be added here. each dataset can have different configurations
44
+ ```
45
+
46
+ `cache_directory` is optional; the default is None, which uses the same directory as the image directory. However, we recommend setting the cache directory explicitly to avoid accidentally sharing cache files between different datasets.
47
+
48
+ `num_repeats` is also available. It is optional, default is 1 (no repeat). It repeats the images (or videos) that many times to expand the dataset. For example, if `num_repeats = 2` and there are 20 images in the dataset, each image will be duplicated twice (with the same caption) to have a total of 40 images. It is useful to balance the multiple datasets with different sizes.
49
+
50
+ <details>
51
+ <summary>日本語</summary>
52
+
53
+ `cache_directory` はオプションです。デフォルトは画像ディレクトリと同じディレクトリに設定されます。ただし、異なるデータセット間でキャッシュファイルが共有されるのを防ぐために、明示的に別のキャッシュディレクトリを設定することをお勧めします。
54
+
55
+ `num_repeats` はオプションで、デフォルトは 1 です(繰り返しなし)。画像(や動画)を、その回数だけ単純に繰り返してデータセットを拡張します。たとえば`num_repeats = 2`としたとき、画像20枚のデータセットなら、各画像が2枚ずつ(同一のキャプションで)計40枚存在した場合と同じになります。異なるデータ数のデータセット間でバランスを取るために使用可能です。
56
+
57
+ resolution, caption_extension, batch_size, num_repeats, enable_bucket, bucket_no_upscale は general または datasets のどちらかに設定してください。省略時は各項目のデフォルト値が使用されます。
58
+
59
+ `[[datasets]]`以下を追加することで、他のデータセットを追加できます。各データセットには異なる設定を持てます。
60
+ </details>
61
+
62
+ ### Sample for Image Dataset with Metadata JSONL File
63
+
64
+ ```toml
65
+ # resolution, batch_size, num_repeats, enable_bucket, bucket_no_upscale should be set in either general or datasets
66
+ # caption_extension is not required for metadata jsonl file
67
+ # cache_directory is required for each dataset with metadata jsonl file
68
+
69
+ # general configurations
70
+ [general]
71
+ resolution = [960, 544]
72
+ batch_size = 1
73
+ enable_bucket = true
74
+ bucket_no_upscale = false
75
+
76
+ [[datasets]]
77
+ image_jsonl_file = "/path/to/metadata.jsonl"
78
+ cache_directory = "/path/to/cache_directory" # required for metadata jsonl file
79
+ num_repeats = 1 # optional, default is 1. Same as above.
80
+
81
+ # other datasets can be added here. each dataset can have different configurations
82
+ ```
83
+
84
+ JSONL file format for metadata:
85
+
86
+ ```json
87
+ {"image_path": "/path/to/image1.jpg", "caption": "A caption for image1"}
88
+ {"image_path": "/path/to/image2.jpg", "caption": "A caption for image2"}
89
+ ```
90
+
91
+ <details>
92
+ <summary>日本語</summary>
93
+
94
+ resolution, batch_size, num_repeats, enable_bucket, bucket_no_upscale は general または datasets のどちらかに設定してください。省略時は各項目のデフォルト値が使用されます。
95
+
96
+ metadata jsonl ファイルを使用する場合、caption_extension は必要ありません。また、cache_directory は必須です。
97
+
98
+ キャプションによるデータセットと同様に、複数のデータセットを追加できます。各データセットには異なる設定を持てます。
99
+ </details>
100
+
101
+
102
+ ### Sample for Video Dataset with Caption Text Files
103
+
104
+ ```toml
105
+ # Common parameters (resolution, caption_extension, batch_size, num_repeats, enable_bucket, bucket_no_upscale)
106
+ # can be set in either general or datasets sections
107
+ # Video-specific parameters (target_frames, frame_extraction, frame_stride, frame_sample, max_frames, source_fps)
108
+ # must be set in each datasets section
109
+
110
+ # general configurations
111
+ [general]
112
+ resolution = [960, 544]
113
+ caption_extension = ".txt"
114
+ batch_size = 1
115
+ enable_bucket = true
116
+ bucket_no_upscale = false
117
+
118
+ [[datasets]]
119
+ video_directory = "/path/to/video_dir"
120
+ cache_directory = "/path/to/cache_directory" # recommended to set cache directory
121
+ target_frames = [1, 25, 45]
122
+ frame_extraction = "head"
123
+ source_fps = 30.0 # optional, source fps for videos in the directory, decimal number
124
+
125
+ [[datasets]]
126
+ video_directory = "/path/to/video_dir2"
127
+ cache_directory = "/path/to/cache_directory2" # recommended to set cache directory
128
+ frame_extraction = "full"
129
+ max_frames = 45
130
+
131
+ # other datasets can be added here. each dataset can have different configurations
132
+ ```
133
+
134
+ __In HunyuanVideo and Wan2.1, the number of `target_frames` must be "N\*4+1" (N=0,1,2,...).__ Otherwise, it will be truncated to the nearest "N*4+1".
135
+
136
+ In FramePack, it is recommended to set `frame_extraction` to `full` and `max_frames` to a sufficiently large value, as it can handle longer videos. However, if the video is too long, an Out of Memory error may occur during VAE encoding. The videos in FramePack are trimmed to "N * latent_window_size * 4 + 1" frames (for example, 37, 73, 109... if `latent_window_size` is 9).
137
+
138
+ If the `source_fps` is specified, the videos in the directory are considered to be at this frame rate, and some frames will be skipped to match the model's frame rate (24 for HunyuanVideo and 16 for Wan2.1). __The value must be a decimal number, for example, `30.0` instead of `30`.__ The skipping is done automatically and does not consider the content of the images. Please check if the converted data is correct using `--debug_mode video`.
139
+
140
+ If `source_fps` is not specified (default), all frames of the video will be used regardless of the video's frame rate.
141
+
142
+ <details>
143
+ <summary>日本語</summary>
144
+
145
+ 共通パラメータ(resolution, caption_extension, batch_size, num_repeats, enable_bucket, bucket_no_upscale)は、generalまたはdatasetsのいずれかに設定できます。
146
+ 動画固有のパラメータ(target_frames, frame_extraction, frame_stride, frame_sample, max_frames, source_fps)は、各datasetsセクションに設定する必要があります。
147
+
148
+ __HunyuanVideoおよびWan2.1では、target_framesの数値は「N\*4+1」である必要があります。__ これ以外の値の場合は、最も近いN\*4+1の値に切り捨てられます。
149
+
150
+ FramePackでも同様ですが、FramePackでは動画が長くても学習可能なため、 `frame_extraction`に`full` を指定し、`max_frames`を十分に大きな値に設定することをお勧めします。ただし、あまりにも長すぎるとVAEのencodeでOut of Memoryエラーが発生する可能性があります。FramePackの動画は、「N * latent_window_size * 4 + 1」フレームにトリミングされます(latent_window_sizeが9の場合、37、73、109……)。
151
+
152
+ `source_fps`を指定した場合、ディレクトリ内の動画をこのフレームレートとみなして、モデルのフレームレートにあうようにいくつかのフレームをスキップします(HunyuanVideoは24、Wan2.1は16)。__小数点を含む数値で指定してください。__ 例:`30`ではなく`30.0`。スキップは機械的に行われ、画像の内容は考慮しません。変換後のデータが正しいか、`--debug_mode video`で確認してください。
153
+
154
+ `source_fps`を指定しない場合、動画のフレームは(動画自体のフレームレートに関係なく)すべて使用されます。
155
+
156
+ 他の注意事項は画像データセットと同様です。
157
+ </details>
158
+
159
+ ### Sample for Video Dataset with Metadata JSONL File
160
+
161
+ ```toml
162
+ # Common parameters (resolution, caption_extension, batch_size, num_repeats, enable_bucket, bucket_no_upscale)
163
+ # can be set in either general or datasets sections
164
+ # Video-specific parameters (target_frames, frame_extraction, frame_stride, frame_sample, max_frames, source_fps)
165
+ # must be set in each datasets section
166
+
167
+ # caption_extension is not required for metadata jsonl file
168
+ # cache_directory is required for each dataset with metadata jsonl file
169
+
170
+ # general configurations
171
+ [general]
172
+ resolution = [960, 544]
173
+ batch_size = 1
174
+ enable_bucket = true
175
+ bucket_no_upscale = false
176
+
177
+ [[datasets]]
178
+ video_jsonl_file = "/path/to/metadata.jsonl"
179
+ target_frames = [1, 25, 45]
180
+ frame_extraction = "head"
181
+ cache_directory = "/path/to/cache_directory_head"
182
+ source_fps = 30.0 # optional, source fps for videos in the jsonl file
183
+ # same metadata jsonl file can be used for multiple datasets
184
+ [[datasets]]
185
+ video_jsonl_file = "/path/to/metadata.jsonl"
186
+ target_frames = [1]
187
+ frame_stride = 10
188
+ cache_directory = "/path/to/cache_directory_stride"
189
+
190
+ # other datasets can be added here. each dataset can have different configurations
191
+ ```
192
+
193
+ JSONL file format for metadata:
194
+
195
+ ```json
196
+ {"video_path": "/path/to/video1.mp4", "caption": "A caption for video1"}
197
+ {"video_path": "/path/to/video2.mp4", "caption": "A caption for video2"}
198
+ ```
199
+
200
+ `video_path` can be a directory containing multiple images.
201
+
202
+ <details>
203
+ <summary>日本語</summary>
204
+ metadata jsonl ファイルを使用する場合、caption_extension は必要ありません。また、cache_directory は必須です。
205
+
206
+ `video_path`は、複数の画像を含むディレクトリのパスでも構いません。
207
+
208
+ 他の注意事項は今までのデータセットと同様です。
209
+ </details>
210
+
211
+ ### frame_extraction Options
212
+
213
+ - `head`: Extract the first N frames from the video.
214
+ - `chunk`: Extract frames by splitting the video into chunks of N frames.
215
+ - `slide`: Extract frames from the video with a stride of `frame_stride`.
216
+ - `uniform`: Extract `frame_sample` samples uniformly from the video.
217
+ - `full`: Extract all frames from the video.
218
+
219
+ In the case of `full`, the entire video is used, but it is trimmed to "N*4+1" frames. It is also trimmed to the `max_frames` if it exceeds that value. To avoid Out of Memory errors, please set `max_frames`.
220
+
221
+ The frame extraction methods other than `full` are recommended when the video contains repeated actions. `full` is recommended when each video represents a single complete motion.
222
+
223
+ For example, consider a video with 40 frames. The following diagrams illustrate each extraction:
224
+
225
+ <details>
226
+ <summary>日本語</summary>
227
+
228
+ - `head`: 動画から最初のNフレームを抽出します。
229
+ - `chunk`: 動画をNフレームずつに分割してフレームを抽出します。
230
+ - `slide`: `frame_stride`に指定したフレームごとに動画からNフレームを抽出します。
231
+ - `uniform`: 動画から一定間隔で、`frame_sample`個のNフレームを抽出します。
232
+ - `full`: 動画から全てのフレームを抽出します。
233
+
234
+ `full`の場合、各動画の全体を用いますが、「N*4+1」のフレーム数にトリミングされます。また`max_frames`を超える場合もその値にトリミングされます。Out of Memoryエラーを避けるために、`max_frames`を設定してください。
235
+
236
+ `full`以外の抽出方法は、動画が特定の動作を繰り返している場合にお勧めします。`full`はそれぞれの動画がひとつの完結したモーションの場合にお勧めします。
237
+
238
+ 例えば、40フレームの動画を例とした抽出について、以下の図で説明します。
239
+ </details>
240
+
241
+ ```
242
+ Original Video, 40 frames: x = frame, o = no frame
243
+ oooooooooooooooooooooooooooooooooooooooo
244
+
245
+ head, target_frames = [1, 13, 25] -> extract head frames:
246
+ xooooooooooooooooooooooooooooooooooooooo
247
+ xxxxxxxxxxxxxooooooooooooooooooooooooooo
248
+ xxxxxxxxxxxxxxxxxxxxxxxxxooooooooooooooo
249
+
250
+ chunk, target_frames = [13, 25] -> extract frames by splitting into chunks, into 13 and 25 frames:
251
+ xxxxxxxxxxxxxooooooooooooooooooooooooooo
252
+ oooooooooooooxxxxxxxxxxxxxoooooooooooooo
253
+ ooooooooooooooooooooooooooxxxxxxxxxxxxxo
254
+ xxxxxxxxxxxxxxxxxxxxxxxxxooooooooooooooo
255
+
256
+ NOTE: Please do not include 1 in target_frames if you are using the frame_extraction "chunk". It will make the all frames to be extracted.
257
+ 注: frame_extraction "chunk" を使用する場合、target_frames に 1 を含めないでください。全てのフレームが抽出されてしまいます。
258
+
259
+ slide, target_frames = [1, 13, 25], frame_stride = 10 -> extract N frames with a stride of 10:
260
+ xooooooooooooooooooooooooooooooooooooooo
261
+ ooooooooooxooooooooooooooooooooooooooooo
262
+ ooooooooooooooooooooxooooooooooooooooooo
263
+ ooooooooooooooooooooooooooooooxooooooooo
264
+ xxxxxxxxxxxxxooooooooooooooooooooooooooo
265
+ ooooooooooxxxxxxxxxxxxxooooooooooooooooo
266
+ ooooooooooooooooooooxxxxxxxxxxxxxooooooo
267
+ xxxxxxxxxxxxxxxxxxxxxxxxxooooooooooooooo
268
+ ooooooooooxxxxxxxxxxxxxxxxxxxxxxxxxooooo
269
+
270
+ uniform, target_frames = [1, 13, 25], frame_sample = 4 -> extract `frame_sample` samples uniformly, N frames each:
271
+ xooooooooooooooooooooooooooooooooooooooo
272
+ oooooooooooooxoooooooooooooooooooooooooo
273
+ oooooooooooooooooooooooooxoooooooooooooo
274
+ ooooooooooooooooooooooooooooooooooooooox
275
+ xxxxxxxxxxxxxooooooooooooooooooooooooooo
276
+ oooooooooxxxxxxxxxxxxxoooooooooooooooooo
277
+ ooooooooooooooooooxxxxxxxxxxxxxooooooooo
278
+ oooooooooooooooooooooooooooxxxxxxxxxxxxx
279
+ xxxxxxxxxxxxxxxxxxxxxxxxxooooooooooooooo
280
+ oooooxxxxxxxxxxxxxxxxxxxxxxxxxoooooooooo
281
+ ooooooooooxxxxxxxxxxxxxxxxxxxxxxxxxooooo
282
+ oooooooooooooooxxxxxxxxxxxxxxxxxxxxxxxxx
283
+
284
+ Three Original Videos, 20, 25, 35 frames: x = frame, o = no frame
285
+
286
+ full, max_frames = 31 -> extract all frames (trimmed to the maximum length):
287
+ video1: xxxxxxxxxxxxxxxxx (trimmed to 17 frames)
288
+ video2: xxxxxxxxxxxxxxxxxxxxxxxxx (25 frames)
289
+ video3: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx (trimmed to 31 frames)
290
+ ```
291
+
292
+ ### Sample for Image Dataset with Control Images
293
+
294
+ The dataset with control images is used for the single-frame training of FramePack.
295
+
296
+ The dataset configuration with caption text files is similar to the image dataset, but with an additional `control_directory` parameter.
297
+
298
+ The control images are used from the `control_directory` with the same filename (or different extension) as the image, for example, `image_dir/image1.jpg` and `control_dir/image1.png`. The images in `image_directory` should be the target images (the images to be generated during inference, the changed images). The `control_directory` should contain the starting images for inference. The captions should be stored in `image_directory`.
299
+
300
+ The metadata JSONL file format is the same as the image dataset, but with an additional `control_path` parameter.
301
+
302
+ ```json
303
+ {"image_path": "/path/to/image1.jpg", "control_path": "/path/to/control1.png", "caption": "A caption for image1"}
304
+ {"image_path": "/path/to/image2.jpg", "control_path": "/path/to/control2.png", "caption": "A caption for image2"}
305
+ ```
306
+
307
+ <details>
308
+ <summary>日本語</summary>
309
+ 制御画像を持つデータセットです。FramePackの単一フレーム学習に使用します。
310
+
311
+ キャプションファイルを用いる場合は`control_directory`を追加で指定してください。制御用画像は、画像と同じファイル名(または拡張子のみが異なるファイル名)の、`control_directory`にある画像が使用されます(例:`image_dir/image1.jpg`と`control_dir/image1.png`)。`image_directory`の画像は学習対象の画像(推論時に生成する画像、変化後の画像)としてください。`control_directory`には推論時の開始画像を格納してください。キャプションは`image_directory`へ格納してください。
312
+
313
+ メタデータJSONLファイルを使用する場合は、`control_path`を追加してください。
314
+ </details>
315
+
316
+ ### Sample for Video Dataset with Control Images
317
+
318
+ The dataset with control videos is used for training ControlNet models.
319
+
320
+ The dataset configuration with caption text files is similar to the video dataset, but with an additional `control_directory` parameter.
321
+
322
+ The control video for a video is used from the `control_directory` with the same filename (or different extension) as the video, for example, `video_dir/video1.mp4` and `control_dir/video1.mp4` or `control_dir/video1.mov`. The control video can also be a directory without an extension, for example, `video_dir/video1.mp4` and `control_dir/video1`.
323
+
324
+ ```toml
325
+ [[datasets]]
326
+ video_directory = "/path/to/video_dir"
327
+ control_directory = "/path/to/control_dir" # required for dataset with control videos
328
+ cache_directory = "/path/to/cache_directory" # recommended to set cache directory
329
+ target_frames = [1, 25, 45]
330
+ frame_extraction = "head"
331
+ ```
332
+
333
+ The dataset configuration with metadata JSONL file is same as the video dataset, but metadata JSONL file must include the control video paths. The control video path can be a directory containing multiple images.
334
+
335
+ ```json
336
+ {"video_path": "/path/to/video1.mp4", "control_path": "/path/to/control1.mp4", "caption": "A caption for video1"}
337
+ {"video_path": "/path/to/video2.mp4", "control_path": "/path/to/control2.mp4", "caption": "A caption for video2"}
338
+ ```
339
+
340
+ <details>
341
+ <summary>日本語</summary>
342
+ 制御動画を持つデータセットです。ControlNetモデルの学習に使用します。
343
+
344
+ キャプションを用いる場合のデータセット設定は動画データセットと似ていますが、`control_directory`パラメータが追加されています。上にある例を参照してください。ある動画に対する制御用動画として、動画と同じファイル名(または拡張子のみが異なるファイル名)の、`control_directory`にある動画が使用されます(例:`video_dir/video1.mp4`と`control_dir/video1.mp4`または`control_dir/video1.mov`)。また、拡張子なしのディレクトリ内の、複数枚の画像を制御用動画として使用することもできます(例:`video_dir/video1.mp4`と`control_dir/video1`)。
345
+
346
+ データセット設定でメタデータJSONLファイルを使用する場合は、動画と制御用動画のパスを含める必要があります。制御用動画のパスは、複数枚の画像を含むディレクトリのパスでも構いません。
347
+ </details>
348
+
349
+ ## Specifications
350
+
351
+ ```toml
352
+ # general configurations
353
+ [general]
354
+ resolution = [960, 544] # optional, [W, H], default is [960, 544]. This is the default resolution for all datasets
355
+ caption_extension = ".txt" # optional, default is None. This is the default caption extension for all datasets
356
+ batch_size = 1 # optional, default is 1. This is the default batch size for all datasets
357
+ num_repeats = 1 # optional, default is 1. Number of times to repeat the dataset. Useful to balance the multiple datasets with different sizes.
358
+ enable_bucket = true # optional, default is false. Enable bucketing for datasets
359
+ bucket_no_upscale = false # optional, default is false. Disable upscaling for bucketing. Ignored if enable_bucket is false
360
+
361
+ ### Image Dataset
362
+
363
+ # sample image dataset with caption text files
364
+ [[datasets]]
365
+ image_directory = "/path/to/image_dir"
366
+ caption_extension = ".txt" # required for caption text files, if general caption extension is not set
367
+ resolution = [960, 544] # required if general resolution is not set
368
+ batch_size = 4 # optional, overwrite the default batch size
369
+ num_repeats = 1 # optional, overwrite the default num_repeats
370
+ enable_bucket = false # optional, overwrite the default bucketing setting
371
+ bucket_no_upscale = true # optional, overwrite the default bucketing setting
372
+ cache_directory = "/path/to/cache_directory" # optional, default is None to use the same directory as the image directory. NOTE: caching is always enabled
373
+ control_directory = "/path/to/control_dir" # optional, required for dataset with control images
374
+
375
+ # sample image dataset with metadata **jsonl** file
376
+ [[datasets]]
377
+ image_jsonl_file = "/path/to/metadata.jsonl" # includes pairs of image files and captions
378
+ resolution = [960, 544] # required if general resolution is not set
379
+ cache_directory = "/path/to/cache_directory" # required for metadata jsonl file
380
+ # caption_extension is not required for metadata jsonl file
381
+ # batch_size, num_repeats, enable_bucket, bucket_no_upscale are also available for metadata jsonl file
382
+
383
+ ### Video Dataset
384
+
385
+ # sample video dataset with caption text files
386
+ [[datasets]]
387
+ video_directory = "/path/to/video_dir"
388
+ caption_extension = ".txt" # required for caption text files, if general caption extension is not set
389
+ resolution = [960, 544] # required if general resolution is not set
390
+
391
+ control_directory = "/path/to/control_dir" # optional, required for dataset with control images
392
+
393
+ # following configurations must be set in each [[datasets]] section for video datasets
394
+
395
+ target_frames = [1, 25, 79] # required for video dataset. list of video lengths to extract frames. each element must be N*4+1 (N=0,1,2,...)
396
+
397
+ # NOTE: Please do not include 1 in target_frames if you are using the frame_extraction "chunk". It will cause all frames to be extracted.
398
+
399
+ frame_extraction = "head" # optional, "head" or "chunk", "slide", "uniform". Default is "head"
400
+ frame_stride = 1 # optional, default is 1, available for "slide" frame extraction
401
+ frame_sample = 4 # optional, default is 1 (same as "head"), available for "uniform" frame extraction
402
+ max_frames = 129 # optional, default is 129. Maximum number of frames to extract, available for "full" frame extraction
403
+ # batch_size, num_repeats, enable_bucket, bucket_no_upscale, cache_directory are also available for video dataset
404
+
405
+ # sample video dataset with metadata jsonl file
406
+ [[datasets]]
407
+ video_jsonl_file = "/path/to/metadata.jsonl" # includes pairs of video files and captions
408
+
409
+ target_frames = [1, 79]
410
+
411
+ cache_directory = "/path/to/cache_directory" # required for metadata jsonl file
412
+ # frame_extraction, frame_stride, frame_sample, max_frames are also available for metadata jsonl file
413
+ ```
414
+
415
+ <!--
416
+ # sample image dataset with lance
417
+ [[datasets]]
418
+ image_lance_dataset = "/path/to/lance_dataset"
419
+ resolution = [960, 544] # required if general resolution is not set
420
+ # batch_size, enable_bucket, bucket_no_upscale, cache_directory are also available for lance dataset
421
+ -->
422
+
423
+ The metadata with .json file will be supported in the near future.
424
+
425
+
426
+
427
+ <!--
428
+
429
+ ```toml
430
+ # general configurations
431
+ [general]
432
+ resolution = [960, 544] # optional, [W, H], default is None. This is the default resolution for all datasets
433
+ caption_extension = ".txt" # optional, default is None. This is the default caption extension for all datasets
434
+ batch_size = 1 # optional, default is 1. This is the default batch size for all datasets
435
+ enable_bucket = true # optional, default is false. Enable bucketing for datasets
436
+ bucket_no_upscale = false # optional, default is false. Disable upscaling for bucketing. Ignored if enable_bucket is false
437
+
438
+ # sample image dataset with caption text files
439
+ [[datasets]]
440
+ image_directory = "/path/to/image_dir"
441
+ caption_extension = ".txt" # required for caption text files, if general caption extension is not set
442
+ resolution = [960, 544] # required if general resolution is not set
443
+ batch_size = 4 # optional, overwrite the default batch size
444
+ enable_bucket = false # optional, overwrite the default bucketing setting
445
+ bucket_no_upscale = true # optional, overwrite the default bucketing setting
446
+ cache_directory = "/path/to/cache_directory" # optional, default is None to use the same directory as the image directory. NOTE: caching is always enabled
447
+
448
+ # sample image dataset with metadata **jsonl** file
449
+ [[datasets]]
450
+ image_jsonl_file = "/path/to/metadata.jsonl" # includes pairs of image files and captions
451
+ resolution = [960, 544] # required if general resolution is not set
452
+ cache_directory = "/path/to/cache_directory" # required for metadata jsonl file
453
+ # caption_extension is not required for metadata jsonl file
454
+ # batch_size, enable_bucket, bucket_no_upscale are also available for metadata jsonl file
455
+
456
+ # sample video dataset with caption text files
457
+ [[datasets]]
458
+ video_directory = "/path/to/video_dir"
459
+ caption_extension = ".txt" # required for caption text files, if general caption extension is not set
460
+ resolution = [960, 544] # required if general resolution is not set
461
+ target_frames = [1, 25, 79] # required for video dataset. list of video lengths to extract frames. each element must be N*4+1 (N=0,1,2,...)
462
+ frame_extraction = "head" # optional, "head" or "chunk", "slide", "uniform". Default is "head"
463
+ frame_stride = 1 # optional, default is 1, available for "slide" frame extraction
464
+ frame_sample = 4 # optional, default is 1 (same as "head"), available for "uniform" frame extraction
465
+ # batch_size, enable_bucket, bucket_no_upscale, cache_directory are also available for video dataset
466
+
467
+ # sample video dataset with metadata jsonl file
468
+ [[datasets]]
469
+ video_jsonl_file = "/path/to/metadata.jsonl" # includes pairs of video files and captions
470
+ target_frames = [1, 79]
471
+ cache_directory = "/path/to/cache_directory" # required for metadata jsonl file
472
+ # frame_extraction, frame_stride, frame_sample are also available for metadata jsonl file
473
+ ```
474
+
475
+ # sample image dataset with lance
476
+ [[datasets]]
477
+ image_lance_dataset = "/path/to/lance_dataset"
478
+ resolution = [960, 544] # required if general resolution is not set
479
+ # batch_size, enable_bucket, bucket_no_upscale, cache_directory are also available for lance dataset
480
+
481
+ The metadata with .json file will be supported in the near future.
482
+
483
+
484
+
485
+
486
+ -->
dataset/image_video_dataset.py ADDED
@@ -0,0 +1,1786 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from concurrent.futures import ThreadPoolExecutor
2
+ import glob
3
+ import json
4
+ import math
5
+ import os
6
+ import random
7
+ import time
8
+ from typing import Optional, Sequence, Tuple, Union
9
+
10
+ import numpy as np
11
+ import torch
12
+ from safetensors.torch import save_file, load_file
13
+ from safetensors import safe_open
14
+ from PIL import Image
15
+ import cv2
16
+ import av
17
+
18
+ from utils import safetensors_utils
19
+ from utils.model_utils import dtype_to_str
20
+
21
+ import logging
22
+
23
+ logger = logging.getLogger(__name__)
24
+ logging.basicConfig(level=logging.INFO)
25
+
26
+
27
# Supported still-image extensions. Both lower- and upper-case spellings are
# listed because glob matching is case-sensitive on some filesystems.
IMAGE_EXTENSIONS = [".png", ".jpg", ".jpeg", ".webp", ".bmp", ".PNG", ".JPG", ".JPEG", ".WEBP", ".BMP"]

# Optional plugin formats: importing each module registers a Pillow codec as a
# side effect. Catch ImportError specifically — a bare `except:` would also
# swallow unrelated failures (e.g. a broken plugin install) and hide real bugs.
try:
    import pillow_avif  # noqa: F401  # registers AVIF support in Pillow

    IMAGE_EXTENSIONS.extend([".avif", ".AVIF"])
except ImportError:
    pass

# JPEG-XL on Linux
try:
    from jxlpy import JXLImagePlugin  # noqa: F401

    IMAGE_EXTENSIONS.extend([".jxl", ".JXL"])
except ImportError:
    pass

# JPEG-XL on Windows
try:
    import pillow_jxl  # noqa: F401

    IMAGE_EXTENSIONS.extend([".jxl", ".JXL"])
except ImportError:
    pass
51
+
52
# Video extensions recognized when globbing datasets; the upper-case variants
# are derived from the lower-case list so both spellings match on
# case-sensitive filesystems. Some of them are not tested.
_video_exts_lower = [".mp4", ".webm", ".avi", ".mkv", ".mov", ".flv", ".wmv", ".m4v", ".mpg", ".mpeg"]
VIDEO_EXTENSIONS = _video_exts_lower + [ext.upper() for ext in _video_exts_lower]
74
+
75
# Architecture identifiers. The *_FULL names are written into the safetensors
# metadata "architecture" key by save_latent_cache_common(); the short names
# are presumably used in cache file naming — confirm against the cache path code.
ARCHITECTURE_HUNYUAN_VIDEO = "hv"
ARCHITECTURE_HUNYUAN_VIDEO_FULL = "hunyuan_video"
ARCHITECTURE_WAN = "wan"
ARCHITECTURE_WAN_FULL = "wan"  # NOTE: short and full identifiers coincide for Wan
ARCHITECTURE_FRAMEPACK = "fp"
ARCHITECTURE_FRAMEPACK_FULL = "framepack"
81
+
82
+
83
def glob_images(directory, base="*"):
    """Collect image files under `directory` matching `base` plus any supported extension.

    Returns a sorted, de-duplicated list of paths. With the default
    `base == "*"` only the directory part is glob-escaped (so the wildcard
    still matches); otherwise the whole path including `base` is escaped,
    i.e. `base` is treated as a literal file name.
    """
    found = set()
    for ext in IMAGE_EXTENSIONS:
        if base == "*":
            pattern = os.path.join(glob.escape(directory), base + ext)
        else:
            pattern = glob.escape(os.path.join(directory, base + ext))
        found.update(glob.glob(pattern))  # set handles duplicate matches across extensions
    return sorted(found)
93
+
94
+
95
def glob_videos(directory, base="*"):
    """Collect video files under `directory` matching `base` plus any supported extension.

    Returns a sorted, de-duplicated list of paths. With the default
    `base == "*"` only the directory part is glob-escaped (so the wildcard
    still matches); otherwise the whole path including `base` is escaped,
    i.e. `base` is treated as a literal file name.
    """
    found = set()
    for ext in VIDEO_EXTENSIONS:
        if base == "*":
            pattern = os.path.join(glob.escape(directory), base + ext)
        else:
            pattern = glob.escape(os.path.join(directory, base + ext))
        found.update(glob.glob(pattern))  # set handles duplicate matches across extensions
    return sorted(found)
105
+
106
+
107
def divisible_by(num: int, divisor: int) -> int:
    """Round `num` down to the nearest multiple of `divisor`."""
    return (num // divisor) * divisor
109
+
110
+
111
def resize_image_to_bucket(image: Union[Image.Image, np.ndarray], bucket_reso: tuple[int, int]) -> np.ndarray:
    """Resize and center-crop `image` to the bucket resolution.

    bucket_reso: **(width, height)**

    The image is scaled so the shorter relative side covers the bucket
    (upscaling via PIL LANCZOS, downscaling via cv2 INTER_AREA), then
    center-cropped. Returns a numpy array.
    """
    from_pil = isinstance(image, Image.Image)
    if from_pil:
        src_w, src_h = image.size
    else:
        src_h, src_w = image.shape[:2]

    # Exact match: nothing to do beyond ensuring an ndarray result.
    if bucket_reso == (src_w, src_h):
        return np.array(image) if from_pil else image

    tgt_w, tgt_h = bucket_reso
    if tgt_w == src_w or tgt_h == src_h:
        # One side already matches — only the center crop below is needed.
        image = np.array(image) if from_pil else image
    else:
        # Scale to cover the bucket on both axes, rounding half up.
        scale = max(tgt_w / src_w, tgt_h / src_h)
        src_w = int(src_w * scale + 0.5)
        src_h = int(src_h * scale + 0.5)

        if scale > 1:
            # Upscale with PIL's LANCZOS filter.
            pil_img = image if from_pil else Image.fromarray(image)
            image = np.array(pil_img.resize((src_w, src_h), Image.LANCZOS))
        else:
            # Downscale with OpenCV's area interpolation.
            image = np.array(image) if from_pil else image
            image = cv2.resize(image, (src_w, src_h), interpolation=cv2.INTER_AREA)

    # Center crop to the bucket resolution.
    left = (src_w - tgt_w) // 2
    top = (src_h - tgt_h) // 2
    return image[top : top + tgt_h, left : left + tgt_w]
150
+
151
+
152
class ItemInfo:
    """Metadata for one dataset item (an image or a video clip).

    Carries the caption, original/bucket sizes, optional raw pixel content,
    and the paths of the cached latent and text-encoder-output files.
    """

    def __init__(
        self,
        item_key: str,
        caption: str,
        original_size: tuple[int, int],
        bucket_size: Optional[Union[tuple[int, int], tuple[int, int, int]]] = None,
        frame_count: Optional[int] = None,
        content: Optional[np.ndarray] = None,
        latent_cache_path: Optional[str] = None,
    ) -> None:
        self.item_key = item_key
        self.caption = caption
        self.original_size = original_size  # (width, height) of the source media
        self.bucket_size = bucket_size  # assigned bucket; presumably (W, H) or (W, H, frames) — confirm at call sites
        self.frame_count = frame_count  # number of frames for videos, None for images
        self.content = content  # raw pixel data, populated during caching
        self.latent_cache_path = latent_cache_path
        self.text_encoder_output_cache_path: Optional[str] = None  # set later by the caching pipeline
        self.control_content: Optional[np.ndarray] = None  # optional control image/video pixels

    def __str__(self) -> str:
        content_shape = self.content.shape if self.content is not None else None
        return (
            f"ItemInfo(item_key={self.item_key}, caption={self.caption}, "
            f"original_size={self.original_size}, bucket_size={self.bucket_size}, "
            f"frame_count={self.frame_count}, latent_cache_path={self.latent_cache_path}, content={content_shape})"
        )
179
+
180
+
181
+ # We use simple if-else approach to support multiple architectures.
182
+ # Maybe we can use a plugin system in the future.
183
+
184
+ # the keys of the dict are `<content_type>_FxHxW_<dtype>` for latents
185
+ # and `<content_type>_<dtype|mask>` for other tensors
186
+
187
+
188
def save_latent_cache(item_info: ItemInfo, latent: torch.Tensor):
    """Save a latent cache file for the HunyuanVideo architecture.

    HunyuanVideo doesn't support I2V and control latents, so only the plain
    latent tensor is stored.

    Args:
        item_info: item whose `latent_cache_path` receives the file.
        latent: 4D latent tensor. The cache key encodes the last three
            dimensions as FxHxW, so dim 0 must be the channel axis.
    """
    # The previous message claimed (frame, channel, ...), but the unpacking
    # below uses dim 1 as the frame count in the key — dim 0 is channels.
    assert latent.dim() == 4, "latent should be 4D tensor (channel, frame, height, width)"

    _, F, H, W = latent.shape
    dtype_str = dtype_to_str(latent.dtype)
    sd = {f"latents_{F}x{H}x{W}_{dtype_str}": latent.detach().cpu()}

    save_latent_cache_common(item_info, sd, ARCHITECTURE_HUNYUAN_VIDEO_FULL)
197
+
198
+
199
def save_latent_cache_wan(
    item_info: ItemInfo,
    latent: torch.Tensor,
    clip_embed: Optional[torch.Tensor],
    image_latent: Optional[torch.Tensor],
    control_latent: Optional[torch.Tensor],
):
    """Save a latent cache file for the Wan architecture.

    Args:
        item_info: item whose `latent_cache_path` receives the file.
        latent: 4D latent tensor; the cache key encodes dims 1..3 as FxHxW,
            so dim 0 must be the channel axis.
        clip_embed: optional CLIP embedding (presumably for I2V — confirm).
        image_latent: optional image-conditioning latent.
        control_latent: optional control latent.
    """
    # The previous message claimed (frame, channel, ...), but the unpacking
    # below uses dim 1 as the frame count in the key — dim 0 is channels.
    assert latent.dim() == 4, "latent should be 4D tensor (channel, frame, height, width)"

    _, F, H, W = latent.shape
    dtype_str = dtype_to_str(latent.dtype)
    sd = {f"latents_{F}x{H}x{W}_{dtype_str}": latent.detach().cpu()}

    if clip_embed is not None:
        sd[f"clip_{dtype_str}"] = clip_embed.detach().cpu()

    if image_latent is not None:
        sd[f"latents_image_{F}x{H}x{W}_{dtype_str}"] = image_latent.detach().cpu()

    if control_latent is not None:
        sd[f"latents_control_{F}x{H}x{W}_{dtype_str}"] = control_latent.detach().cpu()

    save_latent_cache_common(item_info, sd, ARCHITECTURE_WAN_FULL)
223
+
224
+
225
def save_latent_cache_framepack(
    item_info: ItemInfo,
    latent: torch.Tensor,
    latent_indices: torch.Tensor,
    clean_latents: torch.Tensor,
    clean_latent_indices: torch.Tensor,
    clean_latents_2x: torch.Tensor,
    clean_latent_2x_indices: torch.Tensor,
    clean_latents_4x: torch.Tensor,
    clean_latent_4x_indices: torch.Tensor,
    image_embeddings: torch.Tensor,
):
    """Save a latent cache file for the FramePack architecture.

    Stores the target latent, the "clean" context latents at the 1x/2x/4x
    levels with their index tensors, and the image encoder embeddings.

    Note: the FxHxW suffix of every `latents_*` key comes from the main
    `latent` tensor's shape, not from the individual clean latents.
    """
    # The previous message claimed (frame, channel, ...), but the unpacking
    # below uses dim 1 as the frame count in the key — dim 0 is channels.
    assert latent.dim() == 4, "latent should be 4D tensor (channel, frame, height, width)"

    _, F, H, W = latent.shape
    dtype_str = dtype_to_str(latent.dtype)
    sd = {f"latents_{F}x{H}x{W}_{dtype_str}": latent.detach().cpu().contiguous()}

    # `latents_xxx` keys must carry the {F}x{H}x{W} suffix; index tensors only
    # carry their dtype.
    indices_dtype_str = dtype_to_str(latent_indices.dtype)
    sd[f"image_embeddings_{dtype_str}"] = image_embeddings.detach().cpu()  # image embeddings dtype is same as latents dtype
    sd[f"latent_indices_{indices_dtype_str}"] = latent_indices.detach().cpu()
    sd[f"clean_latent_indices_{indices_dtype_str}"] = clean_latent_indices.detach().cpu()
    sd[f"clean_latent_2x_indices_{indices_dtype_str}"] = clean_latent_2x_indices.detach().cpu()
    sd[f"clean_latent_4x_indices_{indices_dtype_str}"] = clean_latent_4x_indices.detach().cpu()
    sd[f"latents_clean_{F}x{H}x{W}_{dtype_str}"] = clean_latents.detach().cpu().contiguous()
    sd[f"latents_clean_2x_{F}x{H}x{W}_{dtype_str}"] = clean_latents_2x.detach().cpu().contiguous()
    sd[f"latents_clean_4x_{F}x{H}x{W}_{dtype_str}"] = clean_latents_4x.detach().cpu().contiguous()

    save_latent_cache_common(item_info, sd, ARCHITECTURE_FRAMEPACK_FULL)
258
+
259
+
260
def save_latent_cache_common(item_info: ItemInfo, sd: dict[str, torch.Tensor], arch_fullname: str):
    """Write `sd` to `item_info.latent_cache_path` as a safetensors file.

    Adds architecture/size metadata and zeroes out NaNs (mutating the tensors
    in `sd` in place). Shared by all architecture-specific save helpers.
    """
    metadata = {
        "architecture": arch_fullname,
        "width": f"{item_info.original_size[0]}",
        "height": f"{item_info.original_size[1]}",
        "format_version": "1.0.1",
    }
    if item_info.frame_count is not None:
        metadata["frame_count"] = f"{item_info.frame_count}"

    # NaNs would poison training; warn and replace them with 0 in place.
    for key, value in sd.items():
        nan_mask = torch.isnan(value)
        if nan_mask.any():
            logger.warning(f"{key} tensor has NaN: {item_info.item_key}, replace NaN with 0")
            value[nan_mask] = 0

    # Make sure the destination directory exists before writing.
    os.makedirs(os.path.dirname(item_info.latent_cache_path), exist_ok=True)
    save_file(sd, item_info.latent_cache_path, metadata=metadata)
280
+
281
+
282
def save_text_encoder_output_cache(item_info: ItemInfo, embed: torch.Tensor, mask: Optional[torch.Tensor], is_llm: bool):
    """HunyuanVideo architecture only"""
    assert (
        embed.dim() in (1, 2)
    ), f"embed should be 2D tensor (feature, hidden_size) or (hidden_size,), got {embed.shape}"
    assert mask is None or mask.dim() == 1, f"mask should be 1D tensor (feature), got {mask.shape}"

    # key layout: "<llm|clipL>_<dtype>", plus an optional "<llm|clipL>_mask"
    prefix = "llm" if is_llm else "clipL"
    sd = {f"{prefix}_{dtype_to_str(embed.dtype)}": embed.detach().cpu()}
    if mask is not None:
        sd[f"{prefix}_mask"] = mask.detach().cpu()

    save_text_encoder_output_cache_common(item_info, sd, ARCHITECTURE_HUNYUAN_VIDEO_FULL)
297
+
298
+
299
def save_text_encoder_output_cache_wan(item_info: ItemInfo, embed: torch.Tensor):
    """Wan architecture only. Wan2.1 only has a single text encoder"""
    # the "varlen_" prefix marks tensors that must not be stacked at batch time
    key = f"varlen_t5_{dtype_to_str(embed.dtype)}"
    save_text_encoder_output_cache_common(item_info, {key: embed.detach().cpu()}, ARCHITECTURE_WAN_FULL)
308
+
309
+
310
def save_text_encoder_output_cache_framepack(
    item_info: ItemInfo, llama_vec: torch.Tensor, llama_attention_mask: torch.Tensor, clip_l_pooler: torch.Tensor
):
    """FramePack architecture only."""
    # embedding keys carry a dtype suffix; the attention mask key does not
    sd = {
        f"llama_vec_{dtype_to_str(llama_vec.dtype)}": llama_vec.detach().cpu(),
        "llama_attention_mask": llama_attention_mask.detach().cpu(),
        f"clip_l_pooler_{dtype_to_str(clip_l_pooler.dtype)}": clip_l_pooler.detach().cpu(),
    }
    save_text_encoder_output_cache_common(item_info, sd, ARCHITECTURE_FRAMEPACK_FULL)
322
+
323
+
324
def save_text_encoder_output_cache_common(item_info: ItemInfo, sd: dict[str, torch.Tensor], arch_fullname: str):
    """Merge `sd` into the item's existing text-encoder cache file (if any) and save it.

    New tensors in `sd` take precedence over tensors already in the file; existing
    metadata other than caption/format_version is preserved. NaNs are replaced with 0.
    """
    for key, value in sd.items():
        # NaN check and show warning, replace NaN with 0
        if torch.isnan(value).any():
            logger.warning(f"{key} tensor has NaN: {item_info.item_key}, replace NaN with 0")
            value[torch.isnan(value)] = 0

    metadata = {
        "architecture": arch_fullname,
        "caption1": item_info.caption,
        "format_version": "1.0.1",
    }

    if os.path.exists(item_info.text_encoder_output_cache_path):
        # load existing cache and update metadata
        with safetensors_utils.MemoryEfficientSafeOpen(item_info.text_encoder_output_cache_path) as f:
            existing_metadata = f.metadata()
            for key in f.keys():
                if key not in sd:  # avoid overwriting by existing cache, we keep the new one
                    sd[key] = f.get_tensor(key)

        assert existing_metadata["architecture"] == metadata["architecture"], "architecture mismatch"
        if existing_metadata["caption1"] != metadata["caption1"]:
            logger.warning(f"caption mismatch: existing={existing_metadata['caption1']}, new={metadata['caption1']}, overwrite")
        # TODO verify format_version

        existing_metadata.pop("caption1", None)
        existing_metadata.pop("format_version", None)
        metadata.update(existing_metadata)  # copy existing metadata except caption and format_version
    else:
        text_encoder_output_dir = os.path.dirname(item_info.text_encoder_output_cache_path)
        os.makedirs(text_encoder_output_dir, exist_ok=True)

    safetensors_utils.mem_eff_save_file(sd, item_info.text_encoder_output_cache_path, metadata=metadata)
358
+
359
+
360
class BucketSelector:
    """Maps an arbitrary image size to the nearest bucket resolution.

    Buckets are (width, height) pairs of approximately equal area
    (resolution[0] * resolution[1]) whose sides are divisible by the
    architecture's resolution step.
    """

    RESOLUTION_STEPS_HUNYUAN = 16
    RESOLUTION_STEPS_WAN = 16
    RESOLUTION_STEPS_FRAMEPACK = 16

    def __init__(
        self, resolution: Tuple[int, int], enable_bucket: bool = True, no_upscale: bool = False, architecture: str = "no_default"
    ):
        self.resolution = resolution
        self.bucket_area = resolution[0] * resolution[1]
        self.architecture = architecture

        # pick the side-length granularity for this architecture
        if self.architecture == ARCHITECTURE_HUNYUAN_VIDEO:
            self.reso_steps = BucketSelector.RESOLUTION_STEPS_HUNYUAN
        elif self.architecture == ARCHITECTURE_WAN:
            self.reso_steps = BucketSelector.RESOLUTION_STEPS_WAN
        elif self.architecture == ARCHITECTURE_FRAMEPACK:
            self.reso_steps = BucketSelector.RESOLUTION_STEPS_FRAMEPACK
        else:
            raise ValueError(f"Invalid architecture: {self.architecture}")

        if not enable_bucket:
            # only define one bucket
            self.bucket_resolutions = [resolution]
            self.no_upscale = False
        else:
            # prepare bucket resolution
            self.no_upscale = no_upscale
            sqrt_size = int(math.sqrt(self.bucket_area))
            min_size = divisible_by(sqrt_size // 2, self.reso_steps)
            self.bucket_resolutions = []
            # widths from half the square size up to the square size; each width also
            # contributes its transposed (portrait) bucket
            for w in range(min_size, sqrt_size + self.reso_steps, self.reso_steps):
                h = divisible_by(self.bucket_area // w, self.reso_steps)
                self.bucket_resolutions.append((w, h))
                self.bucket_resolutions.append((h, w))

            self.bucket_resolutions = list(set(self.bucket_resolutions))
            self.bucket_resolutions.sort()

        # calculate aspect ratio to find the nearest resolution
        self.aspect_ratios = np.array([w / h for w, h in self.bucket_resolutions])

    def get_bucket_resolution(self, image_size: tuple[int, int]) -> tuple[int, int]:
        """
        return the bucket resolution for the given image size, (width, height)
        """
        area = image_size[0] * image_size[1]
        if self.no_upscale and area <= self.bucket_area:
            # image fits the target area: keep its own size, rounded down to the step
            w, h = image_size
            w = divisible_by(w, self.reso_steps)
            h = divisible_by(h, self.reso_steps)
            return w, h

        # otherwise choose the bucket with the closest aspect ratio
        aspect_ratio = image_size[0] / image_size[1]
        ar_errors = self.aspect_ratios - aspect_ratio
        bucket_id = np.abs(ar_errors).argmin()
        return self.bucket_resolutions[bucket_id]
417
+
418
+
419
def _iter_video_frames(video_path: str):
    """Yield the frames of `video_path` as PIL images, in order.

    `video_path` may be a video file (decoded with PyAV) or a directory of image
    files (sorted by file name). The decoder/container is closed when the
    generator is closed or exhausted.
    """
    if os.path.isfile(video_path):
        container = av.open(video_path)
        try:
            for frame in container.decode(video=0):
                yield frame.to_image()
        finally:
            container.close()
    else:
        # load images in the directory
        image_files = glob_images(video_path)
        image_files.sort()
        for image_file in image_files:
            yield Image.open(image_file).convert("RGB")


def load_video(
    video_path: str,
    start_frame: Optional[int] = None,
    end_frame: Optional[int] = None,
    bucket_selector: Optional[BucketSelector] = None,
    bucket_reso: Optional[tuple[int, int]] = None,
    source_fps: Optional[float] = None,
    target_fps: Optional[float] = None,
) -> list[np.ndarray]:
    """
    bucket_reso: if given, resize the video to the bucket resolution, (width, height)

    If both source_fps and target_fps are given, frames are dropped so the output
    approximates target_fps; otherwise all frames in [start_frame, end_frame) are kept.
    start_frame/end_frame are indices in the (possibly fps-converted) frame sequence.
    """
    # With no fps conversion each source frame maps to itself (delta == 1.0), so the
    # same decimation loop handles both cases (commonized per the original TODO).
    if source_fps is None or target_fps is None:
        frame_index_delta = 1.0
    else:
        frame_index_delta = target_fps / source_fps  # example: 16 / 30 = 0.5333

    video = []
    frames = _iter_video_frames(video_path)
    try:
        frame_index_with_fraction = 0.0
        previous_frame_index = -1
        for image in frames:
            target_frame_index = int(frame_index_with_fraction)
            frame_index_with_fraction += frame_index_delta

            if target_frame_index == previous_frame_index:  # drop this frame
                continue

            # accept this frame
            previous_frame_index = target_frame_index

            if start_frame is not None and target_frame_index < start_frame:
                continue
            if end_frame is not None and target_frame_index >= end_frame:
                break

            if bucket_selector is not None and bucket_reso is None:
                bucket_reso = bucket_selector.get_bucket_resolution(image.size)  # calc resolution from first frame

            # resize_image_to_bucket accepts PIL images (the original video branch
            # already passed PIL); without a bucket we return raw ndarray frames
            if bucket_reso is not None:
                image = resize_image_to_bucket(image, bucket_reso)
            else:
                image = np.array(image)

            video.append(image)
    finally:
        frames.close()  # release the av container / file handles promptly on early break

    return video
541
+
542
+
543
class BucketBatchManager:
    """Groups cached items into fixed-size batches per bucket resolution and serves them by index."""

    def __init__(self, bucketed_item_info: dict[Union[tuple[int, int], tuple[int, int, int]], list[ItemInfo]], batch_size: int):
        self.batch_size = batch_size
        self.buckets = bucketed_item_info
        self.bucket_resos = list(self.buckets.keys())
        self.bucket_resos.sort()

        # indices for enumerating batches. each batch is reso + batch_idx. reso is (width, height) or (width, height, frames)
        self.bucket_batch_indices: list[tuple[Union[tuple[int, int], tuple[int, int, int]], int]] = []
        for bucket_reso in self.bucket_resos:
            bucket = self.buckets[bucket_reso]
            num_batches = math.ceil(len(bucket) / self.batch_size)
            for i in range(num_batches):
                self.bucket_batch_indices.append((bucket_reso, i))

        # do no shuffle here to avoid multiple datasets have different order
        # self.shuffle()

    def show_bucket_info(self):
        # log the item count of each bucket and the total batch count
        for bucket_reso in self.bucket_resos:
            bucket = self.buckets[bucket_reso]
            logger.info(f"bucket: {bucket_reso}, count: {len(bucket)}")

        logger.info(f"total batches: {len(self)}")

    def shuffle(self):
        # shuffle each bucket
        for bucket in self.buckets.values():
            random.shuffle(bucket)

        # shuffle the order of batches
        random.shuffle(self.bucket_batch_indices)

    def __len__(self):
        return len(self.bucket_batch_indices)

    def __getitem__(self, idx):
        """Load cached latents and text-encoder outputs for one batch and collate them.

        Cache keys are decoded by stripping the optional "varlen_" prefix, the dtype
        suffix, and (for latents) the FxHxW suffix; "varlen_" tensors are returned as
        lists instead of being stacked.
        """
        bucket_reso, batch_idx = self.bucket_batch_indices[idx]
        bucket = self.buckets[bucket_reso]
        start = batch_idx * self.batch_size
        end = min(start + self.batch_size, len(bucket))

        batch_tensor_data = {}
        varlen_keys = set()
        for item_info in bucket[start:end]:
            sd_latent = load_file(item_info.latent_cache_path)
            sd_te = load_file(item_info.text_encoder_output_cache_path)
            sd = {**sd_latent, **sd_te}

            # TODO refactor this
            for key in sd.keys():
                is_varlen_key = key.startswith("varlen_")  # varlen keys are not stacked
                content_key = key

                if is_varlen_key:
                    content_key = content_key.replace("varlen_", "")

                if content_key.endswith("_mask"):
                    pass  # mask keys carry no dtype suffix
                else:
                    content_key = content_key.rsplit("_", 1)[0]  # remove dtype
                    if content_key.startswith("latents_"):
                        content_key = content_key.rsplit("_", 1)[0]  # remove FxHxW

                if content_key not in batch_tensor_data:
                    batch_tensor_data[content_key] = []
                batch_tensor_data[content_key].append(sd[key])

                if is_varlen_key:
                    varlen_keys.add(content_key)

        # stack everything except variable-length tensors into batch tensors
        for key in batch_tensor_data.keys():
            if key not in varlen_keys:
                batch_tensor_data[key] = torch.stack(batch_tensor_data[key])

        return batch_tensor_data
620
+
621
+
622
class ContentDatasource:
    """Abstract base for content sources (images or videos) with captions.

    Subclasses provide indexed access and/or iterator access to their items and
    report which is supported via is_indexable().
    """

    def __init__(self):
        self.caption_only = False  # set to True to only fetch caption for Text Encoder caching
        self.has_control = False  # subclasses set this to True when paired control content exists

    def set_caption_only(self, caption_only: bool):
        self.caption_only = caption_only

    def is_indexable(self):
        # default: only sequential iteration is supported
        return False

    def get_caption(self, idx: int) -> tuple[str, str]:
        """
        Returns caption. May not be called if is_indexable() returns False.
        """
        raise NotImplementedError

    def __len__(self):
        raise NotImplementedError

    def __iter__(self):
        raise NotImplementedError

    def __next__(self):
        raise NotImplementedError
647
+
648
+
649
class ImageDatasource(ContentDatasource):
    """Base class for image sources; adds image fetching on top of ContentDatasource."""

    def __init__(self):
        super().__init__()

    def get_image_data(self, idx: int) -> tuple[str, Image.Image, str]:
        """
        Returns image data as a tuple of image path, image, and caption for the given index.
        Key must be unique and valid as a file name.
        May not be called if is_indexable() returns False.
        """
        raise NotImplementedError
660
+
661
+
662
class ImageDirectoryDatasource(ImageDatasource):
    """Image datasource backed by a directory of images, with optional caption
    files (same basename + caption_extension) and optional control images in a
    parallel directory."""

    def __init__(self, image_directory: str, caption_extension: Optional[str] = None, control_directory: Optional[str] = None):
        super().__init__()
        self.image_directory = image_directory
        self.caption_extension = caption_extension
        self.control_directory = control_directory
        self.current_idx = 0

        # glob images
        logger.info(f"glob images in {self.image_directory}")
        self.image_paths = glob_images(self.image_directory)
        logger.info(f"found {len(self.image_paths)} images")

        # glob control images if specified
        if self.control_directory is not None:
            logger.info(f"glob control images in {self.control_directory}")
            self.has_control = True
            self.control_paths = {}
            for image_path in self.image_paths:
                image_basename = os.path.basename(image_path)
                control_path = os.path.join(self.control_directory, image_basename)
                if os.path.exists(control_path):
                    self.control_paths[image_path] = control_path
                else:
                    # another extension for control path
                    # for example: image_path = "img/image.png" -> control_path = "control/image.jpg"
                    image_basename_no_ext = os.path.splitext(image_basename)[0]
                    for ext in IMAGE_EXTENSIONS:
                        potential_path = os.path.join(self.control_directory, image_basename_no_ext + ext)
                        if os.path.exists(potential_path):
                            self.control_paths[image_path] = potential_path
                            break

            logger.info(f"found {len(self.control_paths)} matching control images")
            # every image must have a control counterpart; fail fast otherwise
            missing_controls = len(self.image_paths) - len(self.control_paths)
            if missing_controls > 0:
                missing_control_paths = set(self.image_paths) - set(self.control_paths.keys())
                logger.error(f"Could not find matching control images for {missing_controls} images: {missing_control_paths}")
                raise ValueError(f"Could not find matching control images for {missing_controls} images")

    def is_indexable(self):
        return True

    def __len__(self):
        return len(self.image_paths)

    def get_image_data(self, idx: int) -> tuple[str, Image.Image, str, Optional[Image.Image]]:
        """Returns (image_path, image, caption, control image or None) for the index."""
        image_path = self.image_paths[idx]
        image = Image.open(image_path).convert("RGB")

        _, caption = self.get_caption(idx)

        control = None
        if self.has_control:
            control_path = self.control_paths[image_path]
            control = Image.open(control_path).convert("RGB")

        return image_path, image, caption, control

    def get_caption(self, idx: int) -> tuple[str, str]:
        image_path = self.image_paths[idx]
        # NOTE(review): by conditional-expression precedence, caption_path is "" when
        # caption_extension is falsy, so open() below would fail — presumably callers
        # always set an extension for directory datasources; confirm
        caption_path = os.path.splitext(image_path)[0] + self.caption_extension if self.caption_extension else ""
        with open(caption_path, "r", encoding="utf-8") as f:
            caption = f.read().strip()
        return image_path, caption

    def __iter__(self):
        self.current_idx = 0
        return self

    def __next__(self) -> callable:
        """
        Returns a fetcher function that returns image data.
        """
        # fetchers defer the actual file I/O so callers can parallelize loading
        if self.current_idx >= len(self.image_paths):
            raise StopIteration

        if self.caption_only:

            def create_caption_fetcher(index):
                return lambda: self.get_caption(index)

            fetcher = create_caption_fetcher(self.current_idx)
        else:

            def create_image_fetcher(index):
                return lambda: self.get_image_data(index)

            fetcher = create_image_fetcher(self.current_idx)

        self.current_idx += 1
        return fetcher
754
+
755
+
756
class ImageJsonlDatasource(ImageDatasource):
    """Image datasource driven by a JSONL file; each line holds image_path, caption
    and an optional control_path."""

    def __init__(self, image_jsonl_file: str):
        super().__init__()
        self.image_jsonl_file = image_jsonl_file
        self.current_idx = 0

        # load jsonl
        logger.info(f"load image jsonl from {self.image_jsonl_file}")
        self.data = []
        with open(self.image_jsonl_file, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    # report the offending line before propagating
                    logger.error(f"failed to load json: {line} @ {self.image_jsonl_file}")
                    raise
                self.data.append(data)
        logger.info(f"loaded {len(self.data)} images")

        # Check if there are control paths in the JSONL
        self.has_control = any("control_path" in item for item in self.data)
        if self.has_control:
            # control paths must be all-or-nothing across the dataset
            control_count = sum(1 for item in self.data if "control_path" in item)
            if control_count < len(self.data):
                missing_control_images = [item["image_path"] for item in self.data if "control_path" not in item]
                logger.error(f"Some images do not have control paths in JSONL data: {missing_control_images}")
                raise ValueError(f"Some images do not have control paths in JSONL data: {missing_control_images}")
            logger.info(f"found {control_count} control images in JSONL data")

    def is_indexable(self):
        return True

    def __len__(self):
        return len(self.data)

    def get_image_data(self, idx: int) -> tuple[str, Image.Image, str, Optional[Image.Image]]:
        """Returns (image_path, image, caption, control image or None) for the index."""
        data = self.data[idx]
        image_path = data["image_path"]
        image = Image.open(image_path).convert("RGB")

        caption = data["caption"]

        control = None
        if self.has_control:
            control_path = data["control_path"]
            control = Image.open(control_path).convert("RGB")

        return image_path, image, caption, control

    def get_caption(self, idx: int) -> tuple[str, str]:
        data = self.data[idx]
        image_path = data["image_path"]
        caption = data["caption"]
        return image_path, caption

    def __iter__(self):
        self.current_idx = 0
        return self

    def __next__(self) -> callable:
        # returns a zero-argument fetcher so callers can defer/parallelize loading
        if self.current_idx >= len(self.data):
            raise StopIteration

        if self.caption_only:

            def create_caption_fetcher(index):
                return lambda: self.get_caption(index)

            fetcher = create_caption_fetcher(self.current_idx)

        else:

            def create_fetcher(index):
                return lambda: self.get_image_data(index)

            fetcher = create_fetcher(self.current_idx)

        self.current_idx += 1
        return fetcher
835
+
836
+
837
class VideoDatasource(ContentDatasource):
    """Base class for video sources: adds frame-range, bucket and fps-conversion settings
    shared by all concrete video datasources."""

    def __init__(self):
        super().__init__()

        # None means all frames
        self.start_frame = None
        self.end_frame = None

        self.bucket_selector = None

        # when both are set, load_video drops frames to convert source_fps -> target_fps
        self.source_fps = None
        self.target_fps = None

    def __len__(self):
        raise NotImplementedError

    def get_video_data_from_path(
        self,
        video_path: str,
        start_frame: Optional[int] = None,
        end_frame: Optional[int] = None,
        bucket_selector: Optional[BucketSelector] = None,
    ) -> list[Image.Image]:
        # this method can resize the video if bucket_selector is given to reduce the memory usage

        # per-call arguments override the datasource-level defaults
        start_frame = start_frame if start_frame is not None else self.start_frame
        end_frame = end_frame if end_frame is not None else self.end_frame
        bucket_selector = bucket_selector if bucket_selector is not None else self.bucket_selector

        video = load_video(
            video_path, start_frame, end_frame, bucket_selector, source_fps=self.source_fps, target_fps=self.target_fps
        )
        return video

    def get_control_data_from_path(
        self,
        control_path: str,
        start_frame: Optional[int] = None,
        end_frame: Optional[int] = None,
        bucket_selector: Optional[BucketSelector] = None,
    ) -> list[Image.Image]:
        """Load control frames with the same range/bucket/fps rules as the main video."""
        start_frame = start_frame if start_frame is not None else self.start_frame
        end_frame = end_frame if end_frame is not None else self.end_frame
        bucket_selector = bucket_selector if bucket_selector is not None else self.bucket_selector

        control = load_video(
            control_path, start_frame, end_frame, bucket_selector, source_fps=self.source_fps, target_fps=self.target_fps
        )
        return control

    def set_start_and_end_frame(self, start_frame: Optional[int], end_frame: Optional[int]):
        self.start_frame = start_frame
        self.end_frame = end_frame

    def set_bucket_selector(self, bucket_selector: BucketSelector):
        self.bucket_selector = bucket_selector

    def set_source_and_target_fps(self, source_fps: Optional[float], target_fps: Optional[float]):
        self.source_fps = source_fps
        self.target_fps = target_fps

    def __iter__(self):
        raise NotImplementedError

    def __next__(self):
        raise NotImplementedError
904
+
905
class VideoDirectoryDatasource(VideoDatasource):
    """Video datasource backed by a directory of videos, with optional caption files
    and optional control videos/images in a parallel directory."""

    def __init__(self, video_directory: str, caption_extension: Optional[str] = None, control_directory: Optional[str] = None):
        super().__init__()
        self.video_directory = video_directory
        self.caption_extension = caption_extension
        self.control_directory = control_directory  # newly added: directory of control videos/images
        self.current_idx = 0

        # glob videos
        logger.info(f"glob videos in {self.video_directory}")
        self.video_paths = glob_videos(self.video_directory)
        logger.info(f"found {len(self.video_paths)} videos")

        # glob control images if specified
        if self.control_directory is not None:
            logger.info(f"glob control videos in {self.control_directory}")
            self.has_control = True
            self.control_paths = {}
            for video_path in self.video_paths:
                video_basename = os.path.basename(video_path)
                # construct control path from video path
                # for example: video_path = "vid/video.mp4" -> control_path = "control/video.mp4"
                control_path = os.path.join(self.control_directory, video_basename)
                if os.path.exists(control_path):
                    self.control_paths[video_path] = control_path
                else:
                    # use the same base name for control path
                    base_name = os.path.splitext(video_basename)[0]

                    # directory with images. for example: video_path = "vid/video.mp4" -> control_path = "control/video"
                    potential_path = os.path.join(self.control_directory, base_name)  # no extension
                    if os.path.isdir(potential_path):
                        self.control_paths[video_path] = potential_path
                    else:
                        # another extension for control path
                        # for example: video_path = "vid/video.mp4" -> control_path = "control/video.mov"
                        for ext in VIDEO_EXTENSIONS:
                            potential_path = os.path.join(self.control_directory, base_name + ext)
                            if os.path.exists(potential_path):
                                self.control_paths[video_path] = potential_path
                                break

            logger.info(f"found {len(self.control_paths)} matching control videos/images")
            # check if all videos have matching control paths, if not, raise an error
            missing_controls = len(self.video_paths) - len(self.control_paths)
            if missing_controls > 0:
                # logger.warning(f"Could not find matching control videos/images for {missing_controls} videos")
                missing_controls_videos = [video_path for video_path in self.video_paths if video_path not in self.control_paths]
                logger.error(
                    f"Could not find matching control videos/images for {missing_controls} videos: {missing_controls_videos}"
                )
                raise ValueError(f"Could not find matching control videos/images for {missing_controls} videos")

    def is_indexable(self):
        return True

    def __len__(self):
        return len(self.video_paths)

    def get_video_data(
        self,
        idx: int,
        start_frame: Optional[int] = None,
        end_frame: Optional[int] = None,
        bucket_selector: Optional[BucketSelector] = None,
    ) -> tuple[str, list[Image.Image], str, Optional[list[Image.Image]]]:
        """Returns (video_path, frames, caption, control frames or None) for the index."""
        video_path = self.video_paths[idx]
        video = self.get_video_data_from_path(video_path, start_frame, end_frame, bucket_selector)

        _, caption = self.get_caption(idx)

        control = None
        if self.control_directory is not None and video_path in self.control_paths:
            control_path = self.control_paths[video_path]
            control = self.get_control_data_from_path(control_path, start_frame, end_frame, bucket_selector)

        return video_path, video, caption, control

    def get_caption(self, idx: int) -> tuple[str, str]:
        video_path = self.video_paths[idx]
        # NOTE(review): caption_path is "" when caption_extension is falsy (precedence of
        # the conditional expression), so open() would fail — confirm callers always set it
        caption_path = os.path.splitext(video_path)[0] + self.caption_extension if self.caption_extension else ""
        with open(caption_path, "r", encoding="utf-8") as f:
            caption = f.read().strip()
        return video_path, caption

    def __iter__(self):
        self.current_idx = 0
        return self

    def __next__(self):
        # returns a zero-argument fetcher so callers can defer/parallelize decoding
        if self.current_idx >= len(self.video_paths):
            raise StopIteration

        if self.caption_only:

            def create_caption_fetcher(index):
                return lambda: self.get_caption(index)

            fetcher = create_caption_fetcher(self.current_idx)

        else:

            def create_fetcher(index):
                return lambda: self.get_video_data(index)

            fetcher = create_fetcher(self.current_idx)

        self.current_idx += 1
        return fetcher
1014
+
1015
+
1016
class VideoJsonlDatasource(VideoDatasource):
    """Video datasource driven by a JSONL file; each line holds video_path, caption
    and an optional control_path."""

    def __init__(self, video_jsonl_file: str):
        super().__init__()
        self.video_jsonl_file = video_jsonl_file
        self.current_idx = 0

        # load jsonl
        logger.info(f"load video jsonl from {self.video_jsonl_file}")
        self.data = []
        with open(self.video_jsonl_file, "r", encoding="utf-8") as f:
            for line in f:
                # report the offending line before re-raising, consistent with ImageJsonlDatasource
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    logger.error(f"failed to load json: {line} @ {self.video_jsonl_file}")
                    raise
                self.data.append(data)
        logger.info(f"loaded {len(self.data)} videos")

        # Check if there are control paths in the JSONL
        self.has_control = any("control_path" in item for item in self.data)
        if self.has_control:
            # control paths must be all-or-nothing across the dataset
            control_count = sum(1 for item in self.data if "control_path" in item)
            if control_count < len(self.data):
                missing_control_videos = [item["video_path"] for item in self.data if "control_path" not in item]
                logger.error(f"Some videos do not have control paths in JSONL data: {missing_control_videos}")
                raise ValueError(f"Some videos do not have control paths in JSONL data: {missing_control_videos}")
            logger.info(f"found {control_count} control videos/images in JSONL data")

    def is_indexable(self):
        return True

    def __len__(self):
        return len(self.data)

    def get_video_data(
        self,
        idx: int,
        start_frame: Optional[int] = None,
        end_frame: Optional[int] = None,
        bucket_selector: Optional[BucketSelector] = None,
    ) -> tuple[str, list[Image.Image], str, Optional[list[Image.Image]]]:
        """Returns (video_path, frames, caption, control frames or None) for the index."""
        data = self.data[idx]
        video_path = data["video_path"]
        video = self.get_video_data_from_path(video_path, start_frame, end_frame, bucket_selector)

        caption = data["caption"]

        control = None
        if "control_path" in data and data["control_path"]:
            control_path = data["control_path"]
            control = self.get_control_data_from_path(control_path, start_frame, end_frame, bucket_selector)

        return video_path, video, caption, control

    def get_caption(self, idx: int) -> tuple[str, str]:
        """Returns (video_path, caption) for the given index."""
        data = self.data[idx]
        video_path = data["video_path"]
        caption = data["caption"]
        return video_path, caption

    def __iter__(self):
        self.current_idx = 0
        return self

    def __next__(self):
        # returns a zero-argument fetcher so callers can defer/parallelize decoding
        if self.current_idx >= len(self.data):
            raise StopIteration

        if self.caption_only:

            def create_caption_fetcher(index):
                return lambda: self.get_caption(index)

            fetcher = create_caption_fetcher(self.current_idx)

        else:

            def create_fetcher(index):
                return lambda: self.get_video_data(index)

            fetcher = create_fetcher(self.current_idx)

        self.current_idx += 1
        return fetcher
1097
+
1098
+
1099
class BaseDataset(torch.utils.data.Dataset):
    """Common base for image/video training datasets.

    Holds the shared configuration (bucketing, batch size, repeats), the
    cache-file naming scheme for latents and text-encoder outputs, and the
    epoch/step bookkeeping hooks called by the training loop. Subclasses
    implement the actual retrieval / batching methods.
    """

    def __init__(
        self,
        resolution: Tuple[int, int] = (960, 544),
        caption_extension: Optional[str] = None,
        batch_size: int = 1,
        num_repeats: int = 1,
        enable_bucket: bool = False,
        bucket_no_upscale: bool = False,
        cache_directory: Optional[str] = None,
        debug_dataset: bool = False,
        architecture: str = "no_default",
    ):
        self.resolution = resolution
        self.caption_extension = caption_extension
        self.batch_size = batch_size
        self.num_repeats = num_repeats
        self.enable_bucket = enable_bucket
        self.bucket_no_upscale = bucket_no_upscale
        self.cache_directory = cache_directory
        self.debug_dataset = debug_dataset
        self.architecture = architecture
        self.seed = None
        self.current_epoch = 0

        # bucket_no_upscale only makes sense when bucketing is enabled
        if not self.enable_bucket:
            self.bucket_no_upscale = False

    def get_metadata(self) -> dict:
        """Return a JSON-serializable summary of the dataset configuration."""
        metadata = {
            "resolution": self.resolution,
            "caption_extension": self.caption_extension,
            "batch_size_per_device": self.batch_size,
            "num_repeats": self.num_repeats,
            "enable_bucket": bool(self.enable_bucket),
            "bucket_no_upscale": bool(self.bucket_no_upscale),
        }
        return metadata

    def get_all_latent_cache_files(self):
        # latent caches are "{basename}_{WxH}_{architecture}.safetensors"
        return glob.glob(os.path.join(self.cache_directory, f"*_{self.architecture}.safetensors"))

    def get_all_text_encoder_output_cache_files(self):
        # text-encoder caches are "{basename}_{architecture}_te.safetensors"
        return glob.glob(os.path.join(self.cache_directory, f"*_{self.architecture}_te.safetensors"))

    def get_latent_cache_path(self, item_info: ItemInfo) -> str:
        """
        Returns the cache path for the latent tensor.

        item_info: ItemInfo object

        Returns:
            str: cache path

        cache_path is based on the item_key and the resolution.
        """
        w, h = item_info.original_size
        basename = os.path.splitext(os.path.basename(item_info.item_key))[0]
        assert self.cache_directory is not None, "cache_directory is required / cache_directoryは必須です"
        return os.path.join(self.cache_directory, f"{basename}_{w:04d}x{h:04d}_{self.architecture}.safetensors")

    def get_text_encoder_output_cache_path(self, item_info: ItemInfo) -> str:
        """Return the cache path for the text-encoder outputs of this item."""
        basename = os.path.splitext(os.path.basename(item_info.item_key))[0]
        assert self.cache_directory is not None, "cache_directory is required / cache_directoryは必須です"
        return os.path.join(self.cache_directory, f"{basename}_{self.architecture}_te.safetensors")

    def retrieve_latent_cache_batches(self, num_workers: int):
        raise NotImplementedError

    def retrieve_text_encoder_output_cache_batches(self, num_workers: int):
        raise NotImplementedError

    def prepare_for_training(self):
        # subclasses build their batch manager here; no-op by default
        pass

    def set_seed(self, seed: int):
        self.seed = seed

    def set_current_epoch(self, epoch):
        if not self.current_epoch == epoch:  # shuffle buckets when epoch is incremented
            if epoch > self.current_epoch:
                logger.info("epoch is incremented. current_epoch: {}, epoch: {}".format(self.current_epoch, epoch))
                num_epochs = epoch - self.current_epoch
                # advance one epoch at a time so each intermediate epoch gets
                # its own deterministic shuffle (shuffle seeds on current_epoch)
                for _ in range(num_epochs):
                    self.current_epoch += 1
                    self.shuffle_buckets()
                # self.current_epoch seem to be set to 0 again in the next epoch. it may be caused by skipped_dataloader?
            else:
                logger.warning("epoch is not incremented. current_epoch: {}, epoch: {}".format(self.current_epoch, epoch))
                self.current_epoch = epoch

    def set_current_step(self, step):
        self.current_step = step

    def set_max_train_steps(self, max_train_steps):
        self.max_train_steps = max_train_steps

    def shuffle_buckets(self):
        raise NotImplementedError

    def __len__(self):
        # FIX: was `return NotImplementedError`, which returned the exception
        # class and made len() fail with an unrelated TypeError; raise it
        # instead, consistent with __getitem__ and shuffle_buckets.
        raise NotImplementedError

    def __getitem__(self, idx):
        raise NotImplementedError

    def _default_retrieve_text_encoder_output_cache_batches(self, datasource: ContentDatasource, batch_size: int, num_workers: int):
        """Yield batches of ItemInfo (caption only) for text-encoder caching.

        Captions are fetched concurrently via a thread pool; completed futures
        are drained into `data` and sliced into batches of `batch_size`.
        """
        datasource.set_caption_only(True)
        executor = ThreadPoolExecutor(max_workers=num_workers)

        data: list[ItemInfo] = []
        futures = []

        def aggregate_future(consume_all: bool = False):
            # Drain completed futures; when the pool is saturated (or we are
            # flushing), poll until at least one future finishes.
            while len(futures) >= num_workers or (consume_all and len(futures) > 0):
                completed_futures = [future for future in futures if future.done()]
                if len(completed_futures) == 0:
                    if len(futures) >= num_workers or consume_all:  # to avoid adding too many futures
                        time.sleep(0.1)
                        continue
                    else:
                        break  # submit batch if possible

                for future in completed_futures:
                    item_key, caption = future.result()
                    # dummy sizes: only the caption matters for TE caching
                    item_info = ItemInfo(item_key, caption, (0, 0), (0, 0))
                    item_info.text_encoder_output_cache_path = self.get_text_encoder_output_cache_path(item_info)
                    data.append(item_info)

                    futures.remove(future)

        def submit_batch(flush: bool = False):
            # Pop a full batch from the front of `data`; with flush, also a
            # final partial batch. Returns None when nothing is ready.
            nonlocal data
            if len(data) >= batch_size or (len(data) > 0 and flush):
                batch = data[0:batch_size]
                if len(data) > batch_size:
                    data = data[batch_size:]
                else:
                    data = []
                return batch
            return None

        for fetch_op in datasource:
            future = executor.submit(fetch_op)
            futures.append(future)
            aggregate_future()
            while True:
                batch = submit_batch()
                if batch is None:
                    break
                yield batch

        # drain remaining futures and flush partial batches
        aggregate_future(consume_all=True)
        while True:
            batch = submit_batch(flush=True)
            if batch is None:
                break
            yield batch

        executor.shutdown()
1259
+
1260
+
1261
class ImageDataset(BaseDataset):
    """Dataset of still images (optionally paired with control images).

    Items come either from a directory scan (``image_directory``) or from a
    JSONL manifest (``image_jsonl_file``); exactly one must be given.
    """

    def __init__(
        self,
        resolution: Tuple[int, int],
        caption_extension: Optional[str],
        batch_size: int,
        num_repeats: int,
        enable_bucket: bool,
        bucket_no_upscale: bool,
        image_directory: Optional[str] = None,
        image_jsonl_file: Optional[str] = None,
        control_directory: Optional[str] = None,
        cache_directory: Optional[str] = None,
        debug_dataset: bool = False,
        architecture: str = "no_default",
    ):
        super(ImageDataset, self).__init__(
            resolution,
            caption_extension,
            batch_size,
            num_repeats,
            enable_bucket,
            bucket_no_upscale,
            cache_directory,
            debug_dataset,
            architecture,
        )
        self.image_directory = image_directory
        self.image_jsonl_file = image_jsonl_file
        self.control_directory = control_directory
        if image_directory is not None:
            self.datasource = ImageDirectoryDatasource(image_directory, caption_extension, control_directory)
        elif image_jsonl_file is not None:
            # NOTE: the JSONL datasource takes no control_directory; control
            # paths presumably come from the manifest itself — confirm.
            self.datasource = ImageJsonlDatasource(image_jsonl_file)
        else:
            raise ValueError("image_directory or image_jsonl_file must be specified")

        # caches default to living next to the images
        if self.cache_directory is None:
            self.cache_directory = self.image_directory

        self.batch_manager = None  # built later by prepare_for_training()
        self.num_train_items = 0
        self.has_control = self.datasource.has_control

    def get_metadata(self):
        """Extend the base metadata with image-source and control info."""
        metadata = super().get_metadata()
        if self.image_directory is not None:
            metadata["image_directory"] = os.path.basename(self.image_directory)
        if self.image_jsonl_file is not None:
            metadata["image_jsonl_file"] = os.path.basename(self.image_jsonl_file)
        if self.control_directory is not None:
            metadata["control_directory"] = os.path.basename(self.control_directory)
        metadata["has_control"] = self.has_control
        return metadata

    def get_total_image_count(self):
        # None when the datasource cannot be counted up-front (non-indexable)
        return len(self.datasource) if self.datasource.is_indexable() else None

    def retrieve_latent_cache_batches(self, num_workers: int):
        """Yield (bucket_reso, [ItemInfo]) batches of loaded+resized images.

        Images are fetched and resized concurrently via a thread pool, grouped
        into per-bucket-resolution batches, and yielded as soon as a bucket
        has batch_size items (partial batches are flushed at the end).
        """
        buckset_selector = BucketSelector(self.resolution, self.enable_bucket, self.bucket_no_upscale, self.architecture)
        executor = ThreadPoolExecutor(max_workers=num_workers)

        batches: dict[tuple[int, int], list[ItemInfo]] = {}  # (width, height) -> [ItemInfo]
        futures = []

        # aggregate futures and sort by bucket resolution
        def aggregate_future(consume_all: bool = False):
            while len(futures) >= num_workers or (consume_all and len(futures) > 0):
                completed_futures = [future for future in futures if future.done()]
                if len(completed_futures) == 0:
                    if len(futures) >= num_workers or consume_all:  # to avoid adding too many futures
                        time.sleep(0.1)
                        continue
                    else:
                        break  # submit batch if possible

                for future in completed_futures:
                    original_size, item_key, image, caption, control = future.result()
                    # NOTE(review): .shape implies the resized image is an
                    # ndarray here (despite the Image.Image annotation on
                    # fetch_and_resize) — resize_image_to_bucket presumably
                    # converts; confirm.
                    bucket_height, bucket_width = image.shape[:2]
                    bucket_reso = (bucket_width, bucket_height)

                    item_info = ItemInfo(item_key, caption, original_size, bucket_reso, content=image)
                    item_info.latent_cache_path = self.get_latent_cache_path(item_info)

                    if control is not None:
                        item_info.control_content = control

                    if bucket_reso not in batches:
                        batches[bucket_reso] = []
                    batches[bucket_reso].append(item_info)

                    futures.remove(future)

        # submit batch if some bucket has enough items
        def submit_batch(flush: bool = False):
            for key in batches:
                if len(batches[key]) >= self.batch_size or flush:
                    batch = batches[key][0 : self.batch_size]
                    if len(batches[key]) > self.batch_size:
                        batches[key] = batches[key][self.batch_size :]
                    else:
                        # safe despite iterating: we return immediately below
                        del batches[key]
                    return key, batch
            return None, None

        for fetch_op in self.datasource:

            # fetch and resize image in a separate thread
            def fetch_and_resize(op: callable) -> tuple[tuple[int, int], str, Image.Image, str, Optional[Image.Image]]:
                image_key, image, caption, control = op()
                image: Image.Image
                image_size = image.size

                bucket_reso = buckset_selector.get_bucket_resolution(image_size)
                image = resize_image_to_bucket(image, bucket_reso)
                if control is not None:
                    control = resize_image_to_bucket(control, bucket_reso)
                return image_size, image_key, image, caption, control

            future = executor.submit(fetch_and_resize, fetch_op)
            futures.append(future)
            aggregate_future()
            while True:
                key, batch = submit_batch()
                if key is None:
                    break
                yield key, batch

        # drain remaining futures and flush partial batches
        aggregate_future(consume_all=True)
        while True:
            key, batch = submit_batch(flush=True)
            if key is None:
                break
            yield key, batch

        executor.shutdown()

    def retrieve_text_encoder_output_cache_batches(self, num_workers: int):
        # images use the shared caption-only pipeline from BaseDataset
        return self._default_retrieve_text_encoder_output_cache_batches(self.datasource, self.batch_size, num_workers)

    def prepare_for_training(self):
        """Build the bucket batch manager from existing latent cache files.

        Only items that have BOTH a latent cache and a text-encoder output
        cache are included; each item is repeated num_repeats times.
        """
        bucket_selector = BucketSelector(self.resolution, self.enable_bucket, self.bucket_no_upscale, self.architecture)

        # glob cache files
        latent_cache_files = glob.glob(os.path.join(self.cache_directory, f"*_{self.architecture}.safetensors"))

        # assign cache files to item info
        bucketed_item_info: dict[tuple[int, int], list[ItemInfo]] = {}  # (width, height) -> [ItemInfo]
        for cache_file in latent_cache_files:
            # filename layout: {basename}_{WWWWxHHHH}_{architecture}.safetensors
            # (assumes architecture contains no "_" — confirm)
            tokens = os.path.basename(cache_file).split("_")

            image_size = tokens[-2]  # 0000x0000
            image_width, image_height = map(int, image_size.split("x"))
            image_size = (image_width, image_height)

            item_key = "_".join(tokens[:-2])
            text_encoder_output_cache_file = os.path.join(self.cache_directory, f"{item_key}_{self.architecture}_te.safetensors")
            if not os.path.exists(text_encoder_output_cache_file):
                logger.warning(f"Text encoder output cache file not found: {text_encoder_output_cache_file}")
                continue

            bucket_reso = bucket_selector.get_bucket_resolution(image_size)
            item_info = ItemInfo(item_key, "", image_size, bucket_reso, latent_cache_path=cache_file)
            item_info.text_encoder_output_cache_path = text_encoder_output_cache_file

            bucket = bucketed_item_info.get(bucket_reso, [])
            for _ in range(self.num_repeats):
                bucket.append(item_info)
            bucketed_item_info[bucket_reso] = bucket

        # prepare batch manager
        self.batch_manager = BucketBatchManager(bucketed_item_info, self.batch_size)
        self.batch_manager.show_bucket_info()

        self.num_train_items = sum([len(bucket) for bucket in bucketed_item_info.values()])

    def shuffle_buckets(self):
        # set random seed for this epoch
        random.seed(self.seed + self.current_epoch)
        self.batch_manager.shuffle()

    def __len__(self):
        if self.batch_manager is None:
            return 100  # dummy value
        return len(self.batch_manager)

    def __getitem__(self, idx):
        # returns one whole bucketed batch, not a single item
        return self.batch_manager[idx]
1449
+
1450
+
1451
class VideoDataset(BaseDataset):
    """Dataset of videos, sliced into fixed-length frame windows for training.

    Videos come either from a directory scan (``video_directory``) or a JSONL
    manifest (``video_jsonl_file``); exactly one must be given.

    ``frame_extraction`` selects how clips are cut from each video:
      - "head":    the first N frames, for each N in ``target_frames``
      - "chunk":   consecutive non-overlapping N-frame chunks
      - "slide":   sliding windows advanced by ``frame_stride``
      - "uniform": ``frame_sample`` windows spread uniformly over the video
      - "full":    the whole video, capped at ``max_frames``
    """

    # target playback fps per architecture (source videos are resampled when
    # source_fps is provided)
    TARGET_FPS_HUNYUAN = 24.0
    TARGET_FPS_WAN = 16.0
    TARGET_FPS_FRAMEPACK = 30.0

    def __init__(
        self,
        resolution: Tuple[int, int],
        caption_extension: Optional[str],
        batch_size: int,
        num_repeats: int,
        enable_bucket: bool,
        bucket_no_upscale: bool,
        frame_extraction: Optional[str] = "head",
        frame_stride: Optional[int] = 1,
        frame_sample: Optional[int] = 1,
        target_frames: Optional[list[int]] = None,
        max_frames: Optional[int] = None,
        source_fps: Optional[float] = None,
        video_directory: Optional[str] = None,
        video_jsonl_file: Optional[str] = None,
        control_directory: Optional[str] = None,
        cache_directory: Optional[str] = None,
        debug_dataset: bool = False,
        architecture: str = "no_default",
    ):
        super(VideoDataset, self).__init__(
            resolution,
            caption_extension,
            batch_size,
            num_repeats,
            enable_bucket,
            bucket_no_upscale,
            cache_directory,
            debug_dataset,
            architecture,
        )
        self.video_directory = video_directory
        self.video_jsonl_file = video_jsonl_file
        self.control_directory = control_directory
        self.frame_extraction = frame_extraction
        self.frame_stride = frame_stride
        self.frame_sample = frame_sample
        self.max_frames = max_frames
        self.source_fps = source_fps

        if self.architecture == ARCHITECTURE_HUNYUAN_VIDEO:
            self.target_fps = VideoDataset.TARGET_FPS_HUNYUAN
        elif self.architecture == ARCHITECTURE_WAN:
            self.target_fps = VideoDataset.TARGET_FPS_WAN
        elif self.architecture == ARCHITECTURE_FRAMEPACK:
            self.target_fps = VideoDataset.TARGET_FPS_FRAMEPACK
        else:
            raise ValueError(f"Unsupported architecture: {self.architecture}")

        if target_frames is not None:
            target_frames = list(set(target_frames))
            target_frames.sort()

            # round each value to N*4+1 (temporal latent compression is 4x)
            rounded_target_frames = [(f - 1) // 4 * 4 + 1 for f in target_frames]
            # FIX: the deduplicated/sorted list was previously assigned to a
            # misspelled variable ("rouneded_...") and discarded, so duplicate
            # frame counts produced by rounding (e.g. 2 and 3 both round to 1)
            # survived into target_frames.
            rounded_target_frames = sorted(set(rounded_target_frames))

            # if value is changed, warn
            if target_frames != rounded_target_frames:
                logger.warning(f"target_frames are rounded to {rounded_target_frames}")

            target_frames = tuple(rounded_target_frames)

        self.target_frames = target_frames

        if video_directory is not None:
            self.datasource = VideoDirectoryDatasource(video_directory, caption_extension, control_directory)
        elif video_jsonl_file is not None:
            self.datasource = VideoJsonlDatasource(video_jsonl_file)
        else:
            # FIX: previously fell through silently, leaving self.datasource
            # unset and failing later with AttributeError. Raise early like
            # ImageDataset does.
            raise ValueError("video_directory or video_jsonl_file must be specified")

        if self.frame_extraction == "uniform" and self.frame_sample == 1:
            self.frame_extraction = "head"
            logger.warning("frame_sample is set to 1 for frame_extraction=uniform. frame_extraction is changed to head.")
        if self.frame_extraction == "head":
            # head extraction. we can limit the number of frames to be extracted
            self.datasource.set_start_and_end_frame(0, max(self.target_frames))

        # caches default to living next to the videos
        if self.cache_directory is None:
            self.cache_directory = self.video_directory

        self.batch_manager = None  # built later by prepare_for_training()
        self.num_train_items = 0
        self.has_control = self.datasource.has_control

    def get_metadata(self):
        """Extend the base metadata with video-source and slicing info."""
        metadata = super().get_metadata()
        if self.video_directory is not None:
            metadata["video_directory"] = os.path.basename(self.video_directory)
        if self.video_jsonl_file is not None:
            metadata["video_jsonl_file"] = os.path.basename(self.video_jsonl_file)
        if self.control_directory is not None:
            metadata["control_directory"] = os.path.basename(self.control_directory)
        metadata["frame_extraction"] = self.frame_extraction
        metadata["frame_stride"] = self.frame_stride
        metadata["frame_sample"] = self.frame_sample
        metadata["target_frames"] = self.target_frames
        metadata["max_frames"] = self.max_frames
        metadata["source_fps"] = self.source_fps
        metadata["has_control"] = self.has_control
        return metadata

    def retrieve_latent_cache_batches(self, num_workers: int):
        """Yield (bucket_key, [ItemInfo]) batches of decoded+resized clips.

        Videos are decoded and resized concurrently via a thread pool, sliced
        into windows per frame_extraction, grouped by
        (width, height, frame_count), and yielded as soon as a bucket has
        batch_size items (partial batches are flushed at the end).
        """
        bucket_selector = BucketSelector(self.resolution, architecture=self.architecture)
        self.datasource.set_bucket_selector(bucket_selector)
        if self.source_fps is not None:
            self.datasource.set_source_and_target_fps(self.source_fps, self.target_fps)
        else:
            self.datasource.set_source_and_target_fps(None, None)  # no conversion

        executor = ThreadPoolExecutor(max_workers=num_workers)

        # key: (width, height, frame_count), value: [ItemInfo]
        batches: dict[tuple[int, int, int], list[ItemInfo]] = {}
        futures = []

        def aggregate_future(consume_all: bool = False):
            # Drain completed futures, slicing each video into training
            # windows and bucketing them by (reso, frame_count).
            while len(futures) >= num_workers or (consume_all and len(futures) > 0):
                completed_futures = [future for future in futures if future.done()]
                if len(completed_futures) == 0:
                    if len(futures) >= num_workers or consume_all:  # to avoid adding too many futures
                        time.sleep(0.1)
                        continue
                    else:
                        break  # submit batch if possible

                for future in completed_futures:
                    original_frame_size, video_key, video, caption, control = future.result()

                    frame_count = len(video)
                    video = np.stack(video, axis=0)
                    height, width = video.shape[1:3]
                    bucket_reso = (width, height)  # already resized

                    # process control images if available
                    control_video = None
                    if control is not None:
                        # set frame count to the same as video
                        if len(control) > frame_count:
                            control = control[:frame_count]
                        elif len(control) < frame_count:
                            # if control is shorter than video, repeat the last frame
                            last_frame = control[-1]
                            control.extend([last_frame] * (frame_count - len(control)))
                        control_video = np.stack(control, axis=0)

                    # (start_frame, window_length) pairs to extract
                    crop_pos_and_frames = []
                    if self.frame_extraction == "head":
                        for target_frame in self.target_frames:
                            if frame_count >= target_frame:
                                crop_pos_and_frames.append((0, target_frame))
                    elif self.frame_extraction == "chunk":
                        # split by target_frames
                        for target_frame in self.target_frames:
                            for i in range(0, frame_count, target_frame):
                                if i + target_frame <= frame_count:
                                    crop_pos_and_frames.append((i, target_frame))
                    elif self.frame_extraction == "slide":
                        # slide window
                        for target_frame in self.target_frames:
                            if frame_count >= target_frame:
                                for i in range(0, frame_count - target_frame + 1, self.frame_stride):
                                    crop_pos_and_frames.append((i, target_frame))
                    elif self.frame_extraction == "uniform":
                        # select N frames uniformly
                        for target_frame in self.target_frames:
                            if frame_count >= target_frame:
                                frame_indices = np.linspace(0, frame_count - target_frame, self.frame_sample, dtype=int)
                                for i in frame_indices:
                                    crop_pos_and_frames.append((i, target_frame))
                    elif self.frame_extraction == "full":
                        # select all frames (max_frames is required in this mode)
                        target_frame = min(frame_count, self.max_frames)
                        target_frame = (target_frame - 1) // 4 * 4 + 1  # round to N*4+1
                        crop_pos_and_frames.append((0, target_frame))
                    else:
                        raise ValueError(f"frame_extraction {self.frame_extraction} is not supported")

                    for crop_pos, target_frame in crop_pos_and_frames:
                        cropped_video = video[crop_pos : crop_pos + target_frame]
                        body, ext = os.path.splitext(video_key)
                        # encode crop position and length into the item key so
                        # each window gets its own latent cache file
                        item_key = f"{body}_{crop_pos:05d}-{target_frame:03d}{ext}"
                        batch_key = (*bucket_reso, target_frame)  # bucket_reso with frame_count

                        # crop control video if available
                        cropped_control = None
                        if control_video is not None:
                            cropped_control = control_video[crop_pos : crop_pos + target_frame]

                        item_info = ItemInfo(
                            item_key, caption, original_frame_size, batch_key, frame_count=target_frame, content=cropped_video
                        )
                        item_info.latent_cache_path = self.get_latent_cache_path(item_info)
                        item_info.control_content = cropped_control  # None is allowed

                        batch = batches.get(batch_key, [])
                        batch.append(item_info)
                        batches[batch_key] = batch

                    futures.remove(future)

        def submit_batch(flush: bool = False):
            # Pop a full batch from any bucket; with flush also partials.
            for key in batches:
                if len(batches[key]) >= self.batch_size or flush:
                    batch = batches[key][0 : self.batch_size]
                    if len(batches[key]) > self.batch_size:
                        batches[key] = batches[key][self.batch_size :]
                    else:
                        # safe despite iterating: we return immediately below
                        del batches[key]
                    return key, batch
            return None, None

        for operator in self.datasource:

            def fetch_and_resize(op: callable) -> tuple[tuple[int, int], str, list[np.ndarray], str, Optional[list[np.ndarray]]]:
                result = op()

                if len(result) == 3:  # for backward compatibility TODO remove this in the future
                    video_key, video, caption = result
                    control = None
                else:
                    video_key, video, caption, control = result

                video: list[np.ndarray]
                frame_size = (video[0].shape[1], video[0].shape[0])

                # resize if necessary
                bucket_reso = bucket_selector.get_bucket_resolution(frame_size)
                video = [resize_image_to_bucket(frame, bucket_reso) for frame in video]

                # resize control if necessary
                if control is not None:
                    control = [resize_image_to_bucket(frame, bucket_reso) for frame in control]

                return frame_size, video_key, video, caption, control

            future = executor.submit(fetch_and_resize, operator)
            futures.append(future)
            aggregate_future()
            while True:
                key, batch = submit_batch()
                if key is None:
                    break
                yield key, batch

        # drain remaining futures and flush partial batches
        aggregate_future(consume_all=True)
        while True:
            key, batch = submit_batch(flush=True)
            if key is None:
                break
            yield key, batch

        executor.shutdown()

    def retrieve_text_encoder_output_cache_batches(self, num_workers: int):
        # videos use the shared caption-only pipeline from BaseDataset
        return self._default_retrieve_text_encoder_output_cache_batches(self.datasource, self.batch_size, num_workers)

    def prepare_for_training(self):
        """Build the bucket batch manager from existing latent cache files.

        Only items that have BOTH a latent cache and a text-encoder output
        cache are included; each item is repeated num_repeats times.
        """
        bucket_selector = BucketSelector(self.resolution, self.enable_bucket, self.bucket_no_upscale, self.architecture)

        # glob cache files
        latent_cache_files = glob.glob(os.path.join(self.cache_directory, f"*_{self.architecture}.safetensors"))

        # assign cache files to item info
        bucketed_item_info: dict[tuple[int, int, int], list[ItemInfo]] = {}  # (width, height, frame_count) -> [ItemInfo]
        for cache_file in latent_cache_files:
            # layout: {basename}_{crop-frames}_{WWWWxHHHH}_{architecture}.safetensors
            tokens = os.path.basename(cache_file).split("_")

            image_size = tokens[-2]  # 0000x0000
            image_width, image_height = map(int, image_size.split("x"))
            image_size = (image_width, image_height)

            frame_pos, frame_count = tokens[-3].split("-")[:2]  # "00000-000", or optional section index "00000-000-00"
            frame_pos, frame_count = int(frame_pos), int(frame_count)

            item_key = "_".join(tokens[:-3])
            text_encoder_output_cache_file = os.path.join(self.cache_directory, f"{item_key}_{self.architecture}_te.safetensors")
            if not os.path.exists(text_encoder_output_cache_file):
                logger.warning(f"Text encoder output cache file not found: {text_encoder_output_cache_file}")
                continue

            bucket_reso = bucket_selector.get_bucket_resolution(image_size)
            bucket_reso = (*bucket_reso, frame_count)
            item_info = ItemInfo(item_key, "", image_size, bucket_reso, frame_count=frame_count, latent_cache_path=cache_file)
            item_info.text_encoder_output_cache_path = text_encoder_output_cache_file

            bucket = bucketed_item_info.get(bucket_reso, [])
            for _ in range(self.num_repeats):
                bucket.append(item_info)
            bucketed_item_info[bucket_reso] = bucket

        # prepare batch manager
        self.batch_manager = BucketBatchManager(bucketed_item_info, self.batch_size)
        self.batch_manager.show_bucket_info()

        self.num_train_items = sum([len(bucket) for bucket in bucketed_item_info.values()])

    def shuffle_buckets(self):
        # set random seed for this epoch
        random.seed(self.seed + self.current_epoch)
        self.batch_manager.shuffle()

    def __len__(self):
        if self.batch_manager is None:
            return 100  # dummy value
        return len(self.batch_manager)

    def __getitem__(self, idx):
        # returns one whole bucketed batch, not a single item
        return self.batch_manager[idx]
1766
+
1767
+
1768
class DatasetGroup(torch.utils.data.ConcatDataset):
    """A ConcatDataset over image/video datasets that also fans out the
    training-loop bookkeeping calls (epoch, step, max steps) to every member.
    """

    def __init__(self, datasets: Sequence[Union[ImageDataset, VideoDataset]]):
        super().__init__(datasets)
        self.datasets: list[Union[ImageDataset, VideoDataset]] = datasets
        # total number of training items across all member datasets
        self.num_train_items = sum(member.num_train_items for member in datasets)

    def set_current_epoch(self, epoch):
        for member in self.datasets:
            member.set_current_epoch(epoch)

    def set_current_step(self, step):
        for member in self.datasets:
            member.set_current_step(step)

    def set_max_train_steps(self, max_train_steps):
        for member in self.datasets:
            member.set_max_train_steps(max_train_steps)
fpack_cache_latents.py ADDED
@@ -0,0 +1,454 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import logging
3
+ import math
4
+ import os
5
+ from typing import List
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from tqdm import tqdm
11
+ from transformers import SiglipImageProcessor, SiglipVisionModel
12
+
13
+ from dataset import config_utils
14
+ from dataset.config_utils import BlueprintGenerator, ConfigSanitizer
15
+ from dataset.image_video_dataset import BaseDataset, ItemInfo, save_latent_cache_framepack, ARCHITECTURE_FRAMEPACK
16
+ from frame_pack import hunyuan
17
+ from frame_pack.framepack_utils import load_image_encoders, load_vae
18
+ from hunyuan_model.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
19
+ from frame_pack.clip_vision import hf_clip_vision_encode
20
+ import cache_latents
21
+
22
+ logger = logging.getLogger(__name__)
23
+ logging.basicConfig(level=logging.INFO)
24
+
25
+
26
+ def encode_and_save_batch(
27
+ vae: AutoencoderKLCausal3D,
28
+ feature_extractor: SiglipImageProcessor,
29
+ image_encoder: SiglipVisionModel,
30
+ batch: List[ItemInfo],
31
+ latent_window_size: int,
32
+ vanilla_sampling: bool = False,
33
+ one_frame: bool = False,
34
+ ):
35
+ """Encode a batch of original RGB videos and save FramePack section caches."""
36
+ if one_frame:
37
+ encode_and_save_batch_one_frame(vae, feature_extractor, image_encoder, batch, latent_window_size, vanilla_sampling)
38
+ return
39
+
40
+ # Stack batch into tensor (B,C,F,H,W) in RGB order
41
+ contents = torch.stack([torch.from_numpy(item.content) for item in batch])
42
+ if len(contents.shape) == 4:
43
+ contents = contents.unsqueeze(1) # B, H, W, C -> B, F, H, W, C
44
+
45
+ contents = contents.permute(0, 4, 1, 2, 3).contiguous() # B, C, F, H, W
46
+ contents = contents.to(vae.device, dtype=vae.dtype)
47
+ contents = contents / 127.5 - 1.0 # normalize to [-1, 1]
48
+
49
+ height, width = contents.shape[3], contents.shape[4]
50
+ if height < 8 or width < 8:
51
+ item = batch[0] # other items should have the same size
52
+ raise ValueError(f"Image or video size too small: {item.item_key} and {len(batch) - 1} more, size: {item.original_size}")
53
+
54
+ # calculate latent frame count from original frame count (4n+1)
55
+ latent_f = (batch[0].frame_count - 1) // 4 + 1
56
+
57
+ # calculate the total number of sections (excluding the first frame, divided by window size)
58
+ total_latent_sections = math.floor((latent_f - 1) / latent_window_size)
59
+ if total_latent_sections < 1:
60
+ min_frames_needed = latent_window_size * 4 + 1
61
+ raise ValueError(
62
+ f"Not enough frames for FramePack: {batch[0].frame_count} frames ({latent_f} latent frames), minimum required: {min_frames_needed} frames ({latent_window_size+1} latent frames)"
63
+ )
64
+
65
+ # actual latent frame count (aligned to section boundaries)
66
+ latent_f_aligned = total_latent_sections * latent_window_size + 1 if not one_frame else 1
67
+
68
+ # actual video frame count
69
+ frame_count_aligned = (latent_f_aligned - 1) * 4 + 1
70
+ if frame_count_aligned != batch[0].frame_count:
71
+ logger.info(
72
+ f"Frame count mismatch: required={frame_count_aligned} != actual={batch[0].frame_count}, trimming to {frame_count_aligned}"
73
+ )
74
+ contents = contents[:, :, :frame_count_aligned, :, :]
75
+
76
+ latent_f = latent_f_aligned # Update to the aligned value
77
+
78
+ # VAE encode (list of tensor -> stack)
79
+ latents = hunyuan.vae_encode(contents, vae) # include scaling factor
80
+ latents = latents.to("cpu") # (B, C, latent_f, H/8, W/8)
81
+
82
+ # Vision encoding per‑item (once)
83
+ images = np.stack([item.content[0] for item in batch], axis=0) # B, H, W, C
84
+
85
+ # encode image with image encoder
86
+ image_embeddings = []
87
+ with torch.no_grad():
88
+ for image in images:
89
+ image_encoder_output = hf_clip_vision_encode(image, feature_extractor, image_encoder)
90
+ image_embeddings.append(image_encoder_output.last_hidden_state)
91
+ image_embeddings = torch.cat(image_embeddings, dim=0) # B, LEN, 1152
92
+ image_embeddings = image_embeddings.to("cpu") # Save memory
93
+
94
+ if not vanilla_sampling:
95
+ # padding is reversed for inference (future to past)
96
+ latent_paddings = list(reversed(range(total_latent_sections)))
97
+ # Note: The padding trick for inference. See the paper for details.
98
+ if total_latent_sections > 4:
99
+ latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
100
+
101
+ for b, item in enumerate(batch):
102
+ original_latent_cache_path = item.latent_cache_path
103
+ video_lat = latents[b : b + 1] # keep batch dim, 1, C, F, H, W
104
+
105
+ # emulate inference step (history latents)
106
+ # Note: In inference, history_latents stores *generated* future latents.
107
+ # Here, for caching, we just need its shape and type for clean_* tensors.
108
+ # The actual content doesn't matter much as clean_* will be overwritten.
109
+ history_latents = torch.zeros(
110
+ (1, video_lat.shape[1], 1 + 2 + 16, video_lat.shape[3], video_lat.shape[4]), dtype=video_lat.dtype
111
+ ) # C=16 for HY
112
+
113
+ latent_f_index = latent_f - latent_window_size # Start from the last section
114
+ section_index = total_latent_sections - 1
115
+
116
+ for latent_padding in latent_paddings:
117
+ is_last_section = section_index == 0 # the last section in inference order == the first section in time
118
+ latent_padding_size = latent_padding * latent_window_size
119
+ if is_last_section:
120
+ assert latent_f_index == 1, "Last section should be starting from frame 1"
121
+
122
+ # indices generation (same as inference)
123
+ indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0)
124
+ (
125
+ clean_latent_indices_pre, # Index for start_latent
126
+ blank_indices, # Indices for padding (future context in inference)
127
+ latent_indices, # Indices for the target latents to predict
128
+ clean_latent_indices_post, # Index for the most recent history frame
129
+ clean_latent_2x_indices, # Indices for the next 2 history frames
130
+ clean_latent_4x_indices, # Indices for the next 16 history frames
131
+ ) = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1)
132
+
133
+ # Indices for clean_latents (start + recent history)
134
+ clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
135
+
136
+ # clean latents preparation (emulating inference)
137
+ clean_latents_pre = video_lat[:, :, 0:1, :, :] # Always the first frame (start_latent)
138
+ clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, : 1 + 2 + 16, :, :].split(
139
+ [1, 2, 16], dim=2
140
+ )
141
+ clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2) # Combine start frame + placeholder
142
+
143
+ # Target latents for this section (ground truth)
144
+ target_latents = video_lat[:, :, latent_f_index : latent_f_index + latent_window_size, :, :]
145
+
146
+ # save cache (file path is inside item.latent_cache_path pattern), remove batch dim
147
+ item.latent_cache_path = append_section_idx_to_latent_cache_path(original_latent_cache_path, section_index)
148
+ save_latent_cache_framepack(
149
+ item_info=item,
150
+ latent=target_latents.squeeze(0), # Ground truth for this section
151
+ latent_indices=latent_indices.squeeze(0), # Indices for the ground truth section
152
+ clean_latents=clean_latents.squeeze(0), # Start frame + history placeholder
153
+ clean_latent_indices=clean_latent_indices.squeeze(0), # Indices for start frame + history placeholder
154
+ clean_latents_2x=clean_latents_2x.squeeze(0), # History placeholder
155
+ clean_latent_2x_indices=clean_latent_2x_indices.squeeze(0), # Indices for history placeholder
156
+ clean_latents_4x=clean_latents_4x.squeeze(0), # History placeholder
157
+ clean_latent_4x_indices=clean_latent_4x_indices.squeeze(0), # Indices for history placeholder
158
+ image_embeddings=image_embeddings[b],
159
+ )
160
+
161
+ if is_last_section: # If this was the first section generated in inference (time=0)
162
+ # History gets the start frame + the generated first section
163
+ generated_latents_for_history = video_lat[:, :, : latent_window_size + 1, :, :]
164
+ else:
165
+ # History gets the generated current section
166
+ generated_latents_for_history = target_latents # Use true latents as stand-in for generated
167
+
168
+ history_latents = torch.cat([generated_latents_for_history, history_latents], dim=2)
169
+
170
+ section_index -= 1
171
+ latent_f_index -= latent_window_size
172
+
173
+ else:
174
+ # Vanilla Sampling Logic
175
+ for b, item in enumerate(batch):
176
+ original_latent_cache_path = item.latent_cache_path
177
+ video_lat = latents[b : b + 1] # Keep batch dim: 1, C, F_aligned, H, W
178
+ img_emb = image_embeddings[b] # LEN, 1152
179
+
180
+ for section_index in range(total_latent_sections):
181
+ target_start_f = section_index * latent_window_size + 1
182
+ target_end_f = target_start_f + latent_window_size
183
+ target_latents = video_lat[:, :, target_start_f:target_end_f, :, :]
184
+ start_latent = video_lat[:, :, 0:1, :, :]
185
+
186
+ # Clean latents preparation (Vanilla)
187
+ clean_latents_total_count = 1 + 2 + 16
188
+ history_latents = torch.zeros(
189
+ size=(1, 16, clean_latents_total_count, video_lat.shape[-2], video_lat.shape[-1]),
190
+ device=video_lat.device,
191
+ dtype=video_lat.dtype,
192
+ )
193
+
194
+ history_start_f = 0
195
+ video_start_f = target_start_f - clean_latents_total_count
196
+ copy_count = clean_latents_total_count
197
+ if video_start_f < 0:
198
+ history_start_f = -video_start_f
199
+ copy_count = clean_latents_total_count - history_start_f
200
+ video_start_f = 0
201
+ if copy_count > 0:
202
+ history_latents[:, :, history_start_f:] = video_lat[:, :, video_start_f : video_start_f + copy_count, :, :]
203
+
204
+ # indices generation (Vanilla): copy from FramePack-F1
205
+ indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
206
+ (
207
+ clean_latent_indices_start,
208
+ clean_latent_4x_indices,
209
+ clean_latent_2x_indices,
210
+ clean_latent_1x_indices,
211
+ latent_indices,
212
+ ) = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
213
+ clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
214
+
215
+ clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents.split([16, 2, 1], dim=2)
216
+ clean_latents = torch.cat([start_latent, clean_latents_1x], dim=2)
217
+
218
+ # Save cache
219
+ item.latent_cache_path = append_section_idx_to_latent_cache_path(original_latent_cache_path, section_index)
220
+ save_latent_cache_framepack(
221
+ item_info=item,
222
+ latent=target_latents.squeeze(0),
223
+ latent_indices=latent_indices.squeeze(0), # Indices for target section i
224
+ clean_latents=clean_latents.squeeze(0), # Past clean frames
225
+ clean_latent_indices=clean_latent_indices.squeeze(0), # Indices for clean_latents_pre/post
226
+ clean_latents_2x=clean_latents_2x.squeeze(0), # Past clean frames (2x)
227
+ clean_latent_2x_indices=clean_latent_2x_indices.squeeze(0), # Indices for clean_latents_2x
228
+ clean_latents_4x=clean_latents_4x.squeeze(0), # Past clean frames (4x)
229
+ clean_latent_4x_indices=clean_latent_4x_indices.squeeze(0), # Indices for clean_latents_4x
230
+ image_embeddings=img_emb,
231
+ # Note: We don't explicitly save past_offset_indices,
232
+ # but its size influences the absolute values in other indices.
233
+ )
234
+
235
+
236
def encode_and_save_batch_one_frame(
    vae: AutoencoderKLCausal3D,
    feature_extractor: SiglipImageProcessor,
    image_encoder: SiglipVisionModel,
    batch: List[ItemInfo],
    latent_window_size: int,
    vanilla_sampling: bool = False,
):
    """Encode a batch for one-frame training and save one cache file per item.

    Each item provides a start image (``item.control_content``) and a target
    image (``item.content``); a single target frame is cached and the history
    latents are all zeros.

    Args:
        vae: causal 3D VAE used to encode images into latents.
        feature_extractor: SigLIP image processor for the vision encoder.
        image_encoder: SigLIP vision model producing image embeddings.
        batch: items to encode; all items are assumed to share the same size.
        latent_window_size: FramePack window size used to build the index layout.
        vanilla_sampling: unused here; kept for signature parity with
            ``encode_and_save_batch``.

    Raises:
        ValueError: if the image size is smaller than 8x8 pixels.
    """
    # item.content: target image (H, W, C)
    # item.control_content: start image (H, W, C)

    # Stack batch into tensor (B, F, H, W, C) in RGB order; F=2 (start, target).
    contents = torch.stack(
        [torch.stack([torch.from_numpy(item.control_content), torch.from_numpy(item.content)]) for item in batch]
    )

    contents = contents.permute(0, 4, 1, 2, 3).contiguous()  # B, C, F, H, W
    contents = contents.to(vae.device, dtype=vae.dtype)
    contents = contents / 127.5 - 1.0  # normalize uint8 pixel values to [-1, 1]

    height, width = contents.shape[3], contents.shape[4]
    if height < 8 or width < 8:
        item = batch[0]  # other items should have the same size
        raise ValueError(f"Image or video size too small: {item.item_key} and {len(batch) - 1} more, size: {item.original_size}")

    # VAE encode start frame and target frame separately (scaling factor included)
    start_latents = hunyuan.vae_encode(contents[:, :, 0:1], vae)
    start_latents = start_latents.to("cpu")  # (B, C, 1, H/8, W/8)
    latents = hunyuan.vae_encode(contents[:, :, 1:], vae)
    latents = latents.to("cpu")  # (B, C, 1, H/8, W/8)

    # Vision encoding per item (once): use control content because it is the start image
    images = [item.control_content for item in batch]  # list of [H, W, C]

    # encode image with image encoder
    image_embeddings = []
    with torch.no_grad():
        for image in images:
            image_encoder_output = hf_clip_vision_encode(image, feature_extractor, image_encoder)
            image_embeddings.append(image_encoder_output.last_hidden_state)
    image_embeddings = torch.cat(image_embeddings, dim=0)  # B, LEN, 1152
    image_embeddings = image_embeddings.to("cpu")  # Save memory

    # history latents are always zeroes for one frame training
    history_latents = torch.zeros(
        (1, latents.shape[1], 1 + 2 + 16, latents.shape[3], latents.shape[4]), dtype=latents.dtype
    )  # C=16 for HY

    # indices generation (same layout as inference)
    indices = torch.arange(0, sum([1, latent_window_size, 1, 2, 16])).unsqueeze(0)
    (
        clean_latent_indices_pre,  # Index for start_latent
        latent_indices,  # Indices for the target latents to predict
        clean_latent_indices_post,  # Index for the most recent history frame
        clean_latent_2x_indices,  # Indices for the next 2 history frames
        clean_latent_4x_indices,  # Indices for the next 16 history frames
    ) = indices.split([1, latent_window_size, 1, 2, 16], dim=1)

    # Indices for clean_latents (start + recent history)
    latent_indices = latent_indices[:, -1:]  # Only the last index is used for one frame training
    clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)

    # clean latent placeholders shared by all items (history is all zeros)
    clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, : 1 + 2 + 16, :, :].split([1, 2, 16], dim=2)

    for b, item in enumerate(batch):
        # NOTE(review): the original assigned item.latent_cache_path to an unused
        # local here (leftover from the multi-section variant); removed.

        # clean latents preparation (emulating inference)
        clean_latents_pre = start_latents[b : b + 1]
        clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)  # Combine start frame + placeholder

        # Target latents for this item (ground truth)
        target_latents = latents[b : b + 1]

        # save cache (file path is inside item.latent_cache_path pattern), remove batch dim
        save_latent_cache_framepack(
            item_info=item,
            latent=target_latents.squeeze(0),  # Ground truth for this section
            latent_indices=latent_indices.squeeze(0),  # Indices for the ground truth section
            clean_latents=clean_latents.squeeze(0),  # Start frame + history placeholder
            clean_latent_indices=clean_latent_indices.squeeze(0),  # Indices for start frame + history placeholder
            clean_latents_2x=clean_latents_2x.squeeze(0),  # History placeholder
            clean_latent_2x_indices=clean_latent_2x_indices.squeeze(0),  # Indices for history placeholder
            clean_latents_4x=clean_latents_4x.squeeze(0),  # History placeholder
            clean_latent_4x_indices=clean_latent_4x_indices.squeeze(0),  # Indices for history placeholder
            image_embeddings=image_embeddings[b],
        )
+
325
+
326
def framepack_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
    """Register FramePack latent-caching CLI options on *parser* and return it."""
    add = parser.add_argument
    add("--image_encoder", type=str, required=True, help="Image encoder (CLIP) checkpoint path or directory")
    add("--latent_window_size", type=int, default=9, help="FramePack latent window size (default 9)")
    add(
        "--f1",
        action="store_true",
        help="Generate cache for F1 model (vanilla (autoregressive) sampling) instead of Inverted anti-drifting (plain FramePack)",
    )
    add(
        "--one_frame",
        action="store_true",
        help="Generate cache for one frame training (single frame, single section). latent_window_size is used as the index of the target frame.",
    )
    return parser
340
+
341
+
342
def main(args: argparse.Namespace):
    """Cache FramePack latents for every dataset described in the config file.

    Loads the dataset blueprint, the VAE, and the SigLIP image encoder, then
    delegates the per-dataset encode loop to ``encode_datasets_framepack``.
    With ``--debug_mode`` it only previews the datasets and exits.
    """
    # Pick the compute device: explicit --device wins, else CUDA when available.
    device = args.device if hasattr(args, "device") and args.device else ("cuda" if torch.cuda.is_available() else "cpu")
    device = torch.device(device)

    # Load dataset config and expand it into concrete dataset objects.
    blueprint_generator = BlueprintGenerator(ConfigSanitizer())
    logger.info(f"Load dataset config from {args.dataset_config}")
    user_config = config_utils.load_user_config(args.dataset_config)
    blueprint = blueprint_generator.generate(user_config, args, architecture=ARCHITECTURE_FRAMEPACK)
    train_dataset_group = config_utils.generate_dataset_group_by_blueprint(blueprint.dataset_group)

    datasets = train_dataset_group.datasets

    # Debug mode: visualize the datasets instead of encoding anything.
    if args.debug_mode is not None:
        cache_latents.show_datasets(
            datasets, args.debug_mode, args.console_width, args.console_back, args.console_num_images, fps=16
        )
        return

    assert args.vae is not None, "vae checkpoint is required"

    logger.info(f"Loading VAE model from {args.vae}")
    vae = load_vae(args.vae, args.vae_chunk_size, args.vae_spatial_tile_sample_min_size, device=device)
    vae.to(device)

    logger.info(f"Loading image encoder from {args.image_encoder}")
    feature_extractor, image_encoder = load_image_encoders(args)
    image_encoder.eval()
    image_encoder.to(device)

    logger.info(f"Cache generation mode: {'Vanilla Sampling' if args.f1 else 'Inference Emulation'}")

    # encoding closure: captures the loaded models and CLI flags
    def encode(batch: List[ItemInfo]):
        encode_and_save_batch(vae, feature_extractor, image_encoder, batch, args.latent_window_size, args.f1, args.one_frame)

    # reuse core loop from cache_latents with no change
    encode_datasets_framepack(datasets, encode, args)
380
+
381
+
382
def append_section_idx_to_latent_cache_path(latent_cache_path: str, section_idx: int) -> str:
    """Insert a zero-padded section index into a latent cache file name.

    Cache paths look like ``{basename}_{frame_pos-count}_{WxH}_{arch}.safetensors``;
    the section index is appended to the third ``_``-separated field from the end.
    """
    parts = latent_cache_path.split("_")
    parts[-3] += f"-{section_idx:04d}"  # extend the "frame_pos-count" token
    return "_".join(parts)
386
+
387
+
388
def encode_datasets_framepack(datasets: list[BaseDataset], encode: callable, args: argparse.Namespace):
    """Run latent-cache encoding over every dataset, section by section.

    For video items the cache is split into one file per section, so expected
    cache paths are expanded with a section index before checking for existing
    files. After encoding, cache files on disk that no longer belong to the
    dataset are removed unless ``--keep_cache`` is set.

    Args:
        datasets: datasets to process.
        encode: callable taking a list of ItemInfo and writing their caches.
        args: parsed CLI namespace (num_workers, skip_existing, batch_size, ...).
    """
    num_workers = args.num_workers if args.num_workers is not None else max(1, os.cpu_count() - 1)
    # NOTE: the original reused `i` for both this loop and the batch-slicing
    # loop below; renamed to avoid shadowing.
    for ds_idx, dataset in enumerate(datasets):
        logger.info(f"Encoding dataset [{ds_idx}]")
        all_latent_cache_paths = []
        for _, batch in tqdm(dataset.retrieve_latent_cache_batches(num_workers)):
            batch: list[ItemInfo] = batch  # type: ignore

            # latent_cache_path is "{basename}_{w:04d}x{h:04d}_{self.architecture}.safetensors"
            # For video dataset, we expand it to "{basename}_{section_idx:04d}_{w:04d}x{h:04d}_{self.architecture}.safetensors"
            filtered_batch = []
            for item in batch:
                if item.frame_count is None:
                    # image dataset: one cache file per item
                    all_latent_cache_paths.append(item.latent_cache_path)
                    all_existing = os.path.exists(item.latent_cache_path)
                else:
                    # video dataset: one cache file per section
                    latent_f = (item.frame_count - 1) // 4 + 1
                    num_sections = max(1, math.floor((latent_f - 1) / args.latent_window_size))  # min 1 section
                    all_existing = True
                    for sec in range(num_sections):
                        p = append_section_idx_to_latent_cache_path(item.latent_cache_path, sec)
                        all_latent_cache_paths.append(p)
                        all_existing = all_existing and os.path.exists(p)

                if not all_existing:  # if any section cache is missing
                    filtered_batch.append(item)

            if args.skip_existing:
                if len(filtered_batch) == 0:  # all sections exist
                    logger.info(f"All sections exist for {batch[0].item_key}, skipping")
                    continue
                batch = filtered_batch  # update batch to only missing sections

            bs = args.batch_size if args.batch_size is not None else len(batch)
            for start in range(0, len(batch), bs):
                encode(batch[start : start + bs])

        # normalize paths so on-disk files compare equal to the expected set
        all_latent_cache_paths = [os.path.normpath(p) for p in all_latent_cache_paths]
        all_latent_cache_paths = set(all_latent_cache_paths)

        # remove old cache files not in the dataset
        all_cache_files = dataset.get_all_latent_cache_files()
        for cache_file in all_cache_files:
            if os.path.normpath(cache_file) not in all_latent_cache_paths:
                if args.keep_cache:
                    logger.info(f"Keep cache file not in the dataset: {cache_file}")
                else:
                    os.remove(cache_file)
                    logger.info(f"Removed old cache file: {cache_file}")
439
+
440
+
441
if __name__ == "__main__":
    # Build the CLI: common cache-latents options + HunyuanVideo VAE options
    # + FramePack-specific options.
    parser = cache_latents.setup_parser_common()
    parser = cache_latents.hv_setup_parser(parser)  # VAE
    parser = framepack_setup_parser(parser)

    args = parser.parse_args()

    # FramePack always uses the VAE's own dtype; an explicit override is rejected.
    if args.vae_dtype is not None:
        raise ValueError("VAE dtype is not supported in FramePack")
    # if args.batch_size != 1:
    #     args.batch_size = 1
    #     logger.info("Batch size is set to 1 for FramePack.")

    main(args)
fpack_cache_text_encoder_outputs.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ from typing import Optional, Union
4
+
5
+ import numpy as np
6
+ import torch
7
+ from tqdm import tqdm
8
+ from transformers import LlamaTokenizerFast, LlamaModel, CLIPTokenizer, CLIPTextModel
9
+ from dataset import config_utils
10
+ from dataset.config_utils import BlueprintGenerator, ConfigSanitizer
11
+ from dataset.image_video_dataset import ARCHITECTURE_FRAMEPACK, ItemInfo, save_text_encoder_output_cache_framepack
12
+ import cache_text_encoder_outputs
13
+ from frame_pack import hunyuan
14
+ from frame_pack.framepack_utils import load_text_encoder1, load_text_encoder2
15
+
16
+ import logging
17
+
18
+ from frame_pack.utils import crop_or_pad_yield_mask
19
+
20
+ logger = logging.getLogger(__name__)
21
+ logging.basicConfig(level=logging.INFO)
22
+
23
+
24
def encode_and_save_batch(
    tokenizer1: LlamaTokenizerFast,
    text_encoder1: LlamaModel,
    tokenizer2: CLIPTokenizer,
    text_encoder2: CLIPTextModel,
    batch: list[ItemInfo],
    device: torch.device,
):
    """Encode the captions of *batch* and write a text-encoder cache per item.

    FramePack's ``encode_prompt_conds`` only supports a single prompt, so each
    caption is encoded separately; the LLaMA hidden states are cropped or
    padded to 512 tokens, yielding an attention mask.
    """
    # Phase 1: encode every caption (batch dim removed from each output).
    encoded = []
    for item in batch:
        with torch.autocast(device_type=device.type, dtype=text_encoder1.dtype), torch.no_grad():
            llama_vec, clip_l_pooler = hunyuan.encode_prompt_conds(
                item.caption, text_encoder1, text_encoder2, tokenizer1, tokenizer2
            )
            llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
        encoded.append((llama_vec.squeeze(0), llama_attention_mask.squeeze(0), clip_l_pooler.squeeze(0)))

    # Phase 2: save llama_vec, attention mask and clip_l_pooler to each item's cache.
    for item, (llama_vec, llama_attention_mask, clip_l_pooler) in zip(batch, encoded):
        save_text_encoder_output_cache_framepack(item, llama_vec, llama_attention_mask, clip_l_pooler)
55
+
56
+
57
def main(args):
    """Cache FramePack text-encoder outputs for every dataset in the config.

    Loads the dataset blueprint and both text encoders, encodes all captions,
    writes per-item cache files, and finally removes stale cache files unless
    ``--keep_cache`` is set.
    """
    # Pick the compute device: explicit --device wins, else CUDA when available.
    device = args.device if args.device is not None else "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    # Load dataset config
    blueprint_generator = BlueprintGenerator(ConfigSanitizer())
    logger.info(f"Load dataset config from {args.dataset_config}")
    user_config = config_utils.load_user_config(args.dataset_config)
    blueprint = blueprint_generator.generate(user_config, args, architecture=ARCHITECTURE_FRAMEPACK)
    train_dataset_group = config_utils.generate_dataset_group_by_blueprint(blueprint.dataset_group)

    datasets = train_dataset_group.datasets

    # prepare cache files and paths: all_cache_files_for_dataset = existing cache files, all_cache_paths_for_dataset = all cache paths in the dataset
    all_cache_files_for_dataset, all_cache_paths_for_dataset = cache_text_encoder_outputs.prepare_cache_files_and_paths(datasets)

    # load text encoder (text_encoder1 optionally in fp8 to save memory)
    tokenizer1, text_encoder1 = load_text_encoder1(args, args.fp8_llm, device)
    tokenizer2, text_encoder2 = load_text_encoder2(args)
    text_encoder2.to(device)

    # Encode with Text Encoders
    logger.info("Encoding with Text Encoders")

    def encode_for_text_encoder(batch: list[ItemInfo]):
        encode_and_save_batch(tokenizer1, text_encoder1, tokenizer2, text_encoder2, batch, device)

    cache_text_encoder_outputs.process_text_encoder_batches(
        args.num_workers,
        args.skip_existing,
        args.batch_size,
        datasets,
        all_cache_files_for_dataset,
        all_cache_paths_for_dataset,
        encode_for_text_encoder,
    )

    # remove cache files not in dataset
    cache_text_encoder_outputs.post_process_cache_files(datasets, all_cache_files_for_dataset, all_cache_paths_for_dataset, args.keep_cache)
96
+
97
+
98
def framepack_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
    """Register FramePack text-encoder caching options on *parser* and return it."""
    options = [
        (("--text_encoder1",), dict(type=str, required=True, help="Text Encoder 1 directory")),
        (("--text_encoder2",), dict(type=str, required=True, help="Text Encoder 2 directory")),
        (("--fp8_llm",), dict(action="store_true", help="use fp8 for Text Encoder 1 (LLM)")),
    ]
    for flags, kwargs in options:
        parser.add_argument(*flags, **kwargs)
    return parser
103
+
104
+
105
if __name__ == "__main__":
    # Build the CLI: common text-encoder caching options + FramePack encoder paths.
    parser = cache_text_encoder_outputs.setup_parser_common()
    parser = framepack_setup_parser(parser)

    args = parser.parse_args()
    main(args)
fpack_generate_video.py ADDED
@@ -0,0 +1,1711 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from datetime import datetime
3
+ import gc
4
+ import json
5
+ import random
6
+ import os
7
+ import re
8
+ import time
9
+ import math
10
+ import copy
11
+ from typing import Tuple, Optional, List, Union, Any, Dict
12
+
13
+ import torch
14
+ from safetensors.torch import load_file, save_file
15
+ from safetensors import safe_open
16
+ from PIL import Image
17
+ import cv2
18
+ import numpy as np
19
+ import torchvision.transforms.functional as TF
20
+ from transformers import LlamaModel
21
+ from tqdm import tqdm
22
+
23
+ from networks import lora_framepack
24
+ from hunyuan_model.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
25
+ from frame_pack import hunyuan
26
+ from frame_pack.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked, load_packed_model
27
+ from frame_pack.utils import crop_or_pad_yield_mask, resize_and_center_crop, soft_append_bcthw
28
+ from frame_pack.bucket_tools import find_nearest_bucket
29
+ from frame_pack.clip_vision import hf_clip_vision_encode
30
+ from frame_pack.k_diffusion_hunyuan import sample_hunyuan
31
+ from dataset import image_video_dataset
32
+
33
+ try:
34
+ from lycoris.kohya import create_network_from_weights
35
+ except:
36
+ pass
37
+
38
+ from utils.device_utils import clean_memory_on_device
39
+ from hv_generate_video import save_images_grid, save_videos_grid, synchronize_device
40
+ from wan_generate_video import merge_lora_weights
41
+ from frame_pack.framepack_utils import load_vae, load_text_encoder1, load_text_encoder2, load_image_encoders
42
+ from dataset.image_video_dataset import load_video
43
+
44
+ import logging
45
+
46
+ logger = logging.getLogger(__name__)
47
+ logging.basicConfig(level=logging.INFO)
48
+
49
+
50
class GenerationSettings:
    """Simple container for per-run video-generation settings."""

    def __init__(self, device: torch.device, dit_weight_dtype: Optional[torch.dtype] = None):
        # Target device for inference (e.g. cuda / cpu).
        self.device = device
        self.dit_weight_dtype = dit_weight_dtype  # not used currently because model may be optimized
54
+
55
+
56
+ def parse_args() -> argparse.Namespace:
57
+ """parse command line arguments"""
58
+ parser = argparse.ArgumentParser(description="Wan 2.1 inference script")
59
+
60
+ # WAN arguments
61
+ # parser.add_argument("--ckpt_dir", type=str, default=None, help="The path to the checkpoint directory (Wan 2.1 official).")
62
+ parser.add_argument(
63
+ "--sample_solver", type=str, default="unipc", choices=["unipc", "dpm++", "vanilla"], help="The solver used to sample."
64
+ )
65
+
66
+ parser.add_argument("--dit", type=str, default=None, help="DiT directory or path")
67
+ parser.add_argument("--vae", type=str, default=None, help="VAE directory or path")
68
+ parser.add_argument("--text_encoder1", type=str, required=True, help="Text Encoder 1 directory or path")
69
+ parser.add_argument("--text_encoder2", type=str, required=True, help="Text Encoder 2 directory or path")
70
+ parser.add_argument("--image_encoder", type=str, required=True, help="Image Encoder directory or path")
71
+ parser.add_argument("--f1", action="store_true", help="Use F1 sampling method")
72
+
73
+ # LoRA
74
+ parser.add_argument("--lora_weight", type=str, nargs="*", required=False, default=None, help="LoRA weight path")
75
+ parser.add_argument("--lora_multiplier", type=float, nargs="*", default=1.0, help="LoRA multiplier")
76
+ parser.add_argument("--include_patterns", type=str, nargs="*", default=None, help="LoRA module include patterns")
77
+ parser.add_argument("--exclude_patterns", type=str, nargs="*", default=None, help="LoRA module exclude patterns")
78
+ parser.add_argument(
79
+ "--save_merged_model",
80
+ type=str,
81
+ default=None,
82
+ help="Save merged model to path. If specified, no inference will be performed.",
83
+ )
84
+
85
+ # inference
86
+ parser.add_argument(
87
+ "--prompt",
88
+ type=str,
89
+ default=None,
90
+ help="prompt for generation. If `;;;` is used, it will be split into sections. Example: `section_index:prompt` or "
91
+ "`section_index:prompt;;;section_index:prompt;;;...`, section_index can be `0` or `-1` or `0-2`, `-1` means last section, `0-2` means from 0 to 2 (inclusive).",
92
+ )
93
+ parser.add_argument(
94
+ "--negative_prompt",
95
+ type=str,
96
+ default=None,
97
+ help="negative prompt for generation, default is empty string. should not change.",
98
+ )
99
+ parser.add_argument(
100
+ "--custom_system_prompt",
101
+ type=str,
102
+ default=None,
103
+ help="Custom system prompt for LLM. If specified, it will override the default system prompt. See hunyuan_model/text_encoder.py for the default system prompt.",
104
+ )
105
+ parser.add_argument("--video_size", type=int, nargs=2, default=[256, 256], help="video size, height and width")
106
+ parser.add_argument("--video_seconds", type=float, default=5.0, help="video length, default is 5.0 seconds")
107
+ parser.add_argument(
108
+ "--video_sections",
109
+ type=int,
110
+ default=None,
111
+ help="number of video sections, Default is None (auto calculate from video seconds)",
112
+ )
113
+ parser.add_argument(
114
+ "--one_frame_inference",
115
+ type=str,
116
+ default=None,
117
+ help="one frame inference, default is None, comma separated values from 'zero_post', 'no_2x', 'no_4x' and 'no_post'.",
118
+ )
119
+ parser.add_argument(
120
+ "--image_mask_path",
121
+ type=str,
122
+ default=None,
123
+ help="path to image mask for one frame inference. If specified, it will be used as mask for input image.",
124
+ )
125
+ parser.add_argument(
126
+ "--end_image_mask_path",
127
+ type=str,
128
+ default=None,
129
+ nargs="*",
130
+ help="path to end (reference) image mask for one frame inference. If specified, it will be used as mask for end image.",
131
+ )
132
+ parser.add_argument("--fps", type=int, default=30, help="video fps, default is 30")
133
+ parser.add_argument("--infer_steps", type=int, default=25, help="number of inference steps, default is 25")
134
+ parser.add_argument("--save_path", type=str, required=True, help="path to save generated video")
135
+ parser.add_argument("--seed", type=int, default=None, help="Seed for evaluation.")
136
+ # parser.add_argument(
137
+ # "--cpu_noise", action="store_true", help="Use CPU to generate noise (compatible with ComfyUI). Default is False."
138
+ # )
139
+ parser.add_argument("--latent_window_size", type=int, default=9, help="latent window size, default is 9. should not change.")
140
+ parser.add_argument(
141
+ "--embedded_cfg_scale", type=float, default=10.0, help="Embeded CFG scale (distilled CFG Scale), default is 10.0"
142
+ )
143
+ parser.add_argument(
144
+ "--guidance_scale",
145
+ type=float,
146
+ default=1.0,
147
+ help="Guidance scale for classifier free guidance. Default is 1.0 (no guidance), should not change.",
148
+ )
149
+ parser.add_argument("--guidance_rescale", type=float, default=0.0, help="CFG Re-scale, default is 0.0. Should not change.")
150
+ # parser.add_argument("--video_path", type=str, default=None, help="path to video for video2video inference")
151
+ parser.add_argument(
152
+ "--image_path",
153
+ type=str,
154
+ default=None,
155
+ help="path to image for image2video inference. If `;;;` is used, it will be used as section images. The notation is same as `--prompt`.",
156
+ )
157
+ parser.add_argument("--end_image_path", type=str, nargs="*", default=None, help="path to end image for image2video inference")
158
+ parser.add_argument(
159
+ "--latent_paddings",
160
+ type=str,
161
+ default=None,
162
+ help="latent paddings for each section, comma separated values. default is None (FramePack default paddings)",
163
+ )
164
+ # parser.add_argument(
165
+ # "--control_path",
166
+ # type=str,
167
+ # default=None,
168
+ # help="path to control video for inference with controlnet. video file or directory with images",
169
+ # )
170
+ # parser.add_argument("--trim_tail_frames", type=int, default=0, help="trim tail N frames from the video before saving")
171
+
172
+ # # Flow Matching
173
+ # parser.add_argument(
174
+ # "--flow_shift",
175
+ # type=float,
176
+ # default=None,
177
+ # help="Shift factor for flow matching schedulers. Default depends on task.",
178
+ # )
179
+
180
+ parser.add_argument("--fp8", action="store_true", help="use fp8 for DiT model")
181
+ parser.add_argument("--fp8_scaled", action="store_true", help="use scaled fp8 for DiT, only for fp8")
182
+ # parser.add_argument("--fp8_fast", action="store_true", help="Enable fast FP8 arithmetic (RTX 4XXX+), only for fp8_scaled")
183
+ parser.add_argument("--fp8_llm", action="store_true", help="use fp8 for Text Encoder 1 (LLM)")
184
+ parser.add_argument(
185
+ "--device", type=str, default=None, help="device to use for inference. If None, use CUDA if available, otherwise use CPU"
186
+ )
187
+ parser.add_argument(
188
+ "--attn_mode",
189
+ type=str,
190
+ default="torch",
191
+ choices=["flash", "torch", "sageattn", "xformers", "sdpa"], # "flash2", "flash3",
192
+ help="attention mode",
193
+ )
194
+ parser.add_argument("--vae_chunk_size", type=int, default=None, help="chunk size for CausalConv3d in VAE")
195
+ parser.add_argument(
196
+ "--vae_spatial_tile_sample_min_size", type=int, default=None, help="spatial tile sample min size for VAE, default 256"
197
+ )
198
+ parser.add_argument("--bulk_decode", action="store_true", help="decode all frames at once")
199
+ parser.add_argument("--blocks_to_swap", type=int, default=0, help="number of blocks to swap in the model")
200
+ parser.add_argument(
201
+ "--output_type",
202
+ type=str,
203
+ default="video",
204
+ choices=["video", "images", "latent", "both", "latent_images"],
205
+ help="output type",
206
+ )
207
+ parser.add_argument("--no_metadata", action="store_true", help="do not save metadata")
208
+ parser.add_argument("--latent_path", type=str, nargs="*", default=None, help="path to latent for decode. no inference")
209
+ parser.add_argument("--lycoris", action="store_true", help="use lycoris for inference")
210
+ # parser.add_argument("--compile", action="store_true", help="Enable torch.compile")
211
+ # parser.add_argument(
212
+ # "--compile_args",
213
+ # nargs=4,
214
+ # metavar=("BACKEND", "MODE", "DYNAMIC", "FULLGRAPH"),
215
+ # default=["inductor", "max-autotune-no-cudagraphs", "False", "False"],
216
+ # help="Torch.compile settings",
217
+ # )
218
+
219
+ # New arguments for batch and interactive modes
220
+ parser.add_argument("--from_file", type=str, default=None, help="Read prompts from a file")
221
+ parser.add_argument("--interactive", action="store_true", help="Interactive mode: read prompts from console")
222
+
223
+ args = parser.parse_args()
224
+
225
+ # Validate arguments
226
+ if args.from_file and args.interactive:
227
+ raise ValueError("Cannot use both --from_file and --interactive at the same time")
228
+
229
+ if args.latent_path is None or len(args.latent_path) == 0:
230
+ if args.prompt is None and not args.from_file and not args.interactive:
231
+ raise ValueError("Either --prompt, --from_file or --interactive must be specified")
232
+
233
+ return args
234
+
235
+
236
def parse_prompt_line(line: str) -> Dict[str, Any]:
    """Parse a single prompt line into a dictionary of argument overrides.

    The line has the form ``<prompt> --opt value --opt value ...``; each short
    option name maps to one CLI argument. Unknown options are silently ignored.

    Args:
        line: Prompt line with optional ``--xx value`` settings appended.

    Returns:
        Dict[str, Any]: Dictionary of argument overrides keyed by argument name.
    """
    # TODO common function with hv_train_network.line_to_prompt_dict
    segments = line.split(" --")

    overrides: Dict[str, Any] = {"prompt": segments[0].strip()}
    # these two options may be given multiple times, so collect them in lists
    overrides["end_image_path"] = []
    overrides["end_image_mask_path"] = []

    # single-valued options: short name -> (argument name, converter)
    # note: both "g" and "l" map to guidance_scale
    scalar_options = {
        "w": ("video_size_width", int),
        "h": ("video_size_height", int),
        "f": ("video_seconds", float),
        "d": ("seed", int),
        "s": ("infer_steps", int),
        "g": ("guidance_scale", float),
        "l": ("guidance_scale", float),
        "i": ("image_path", str),
        "im": ("image_mask_path", str),
        "n": ("negative_prompt", str),
        "vs": ("video_sections", int),
        "of": ("one_frame_inference", str),
    }

    for segment in segments[1:]:
        if not segment.strip():
            continue
        pieces = segment.split(" ", 1)
        option = pieces[0].strip()
        value = pieces[1].strip() if len(pieces) > 1 else ""

        if option in scalar_options:
            arg_name, converter = scalar_options[option]
            overrides[arg_name] = converter(value)
        elif option == "ei":  # end_image_path (repeatable)
            overrides["end_image_path"].append(value)
        elif option == "eim":  # end_image_mask_path (repeatable)
            overrides["end_image_mask_path"].append(value)

    # drop the list-valued entries when nothing was collected, so the
    # base argparse values are not overridden by empty lists
    if not overrides["end_image_path"]:
        del overrides["end_image_path"]
    if not overrides["end_image_mask_path"]:
        del overrides["end_image_mask_path"]

    return overrides
301
+
302
+
303
def apply_overrides(args: argparse.Namespace, overrides: Dict[str, Any]) -> argparse.Namespace:
    """Return a deep copy of *args* with the given overrides applied.

    ``video_size_width`` / ``video_size_height`` are mapped onto the two
    elements of ``args.video_size`` (stored as [height, width]); every other
    key is assigned verbatim as an attribute.

    Args:
        args: Original arguments (left unmodified).
        overrides: Dictionary of overrides.

    Returns:
        argparse.Namespace: New arguments with overrides applied.
    """
    new_args = copy.deepcopy(args)

    # keys that address an element of the video_size list instead of a plain attribute
    video_size_index = {"video_size_width": 1, "video_size_height": 0}

    for name, value in overrides.items():
        if name in video_size_index:
            new_args.video_size[video_size_index[name]] = value
        else:
            setattr(new_args, name, value)

    return new_args
324
+
325
+
326
def check_inputs(args: argparse.Namespace) -> Tuple[int, int, float]:
    """Validate video size and compute the effective video length.

    Fix: the return annotation previously said ``Tuple[int, int, int]`` while
    both the docstring and the actual return value use a float for
    ``video_seconds`` (it comes from ``--video_seconds`` or a division by fps).

    Args:
        args: command line arguments

    Returns:
        Tuple[int, int, float]: (height, width, video_seconds)

    Raises:
        ValueError: if height or width is not divisible by 8.
    """
    height = args.video_size[0]
    width = args.video_size[1]

    video_seconds = args.video_seconds
    if args.video_sections is not None:
        # each section yields latent_window_size * 4 frames, plus one initial frame
        video_seconds = (args.video_sections * (args.latent_window_size * 4) + 1) / args.fps

    if height % 8 != 0 or width % 8 != 0:
        raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    return height, width, video_seconds
346
+
347
+
348
+ # region DiT model
349
+
350
+
351
def load_dit_model(args: argparse.Namespace, device: torch.device) -> HunyuanVideoTransformer3DModelPacked:
    """Load the FramePack DiT model.

    Weights stay on CPU whenever they still need post-processing (block swap,
    scaled-fp8 conversion, or LoRA merging); otherwise they are loaded
    directly onto the target device.

    Args:
        args: command line arguments
        device: target inference device

    Returns:
        HunyuanVideoTransformer3DModelPacked: DiT model
    """
    can_load_directly = args.blocks_to_swap == 0 and not args.fp8_scaled and args.lora_weight is None
    loading_device = device if can_load_directly else "cpu"

    # fp8 optimization is deferred here so LoRA weights can be merged first
    return load_packed_model(device, args.dit, args.attn_mode, loading_device)
370
+
371
+
372
def optimize_model(model: HunyuanVideoTransformer3DModelPacked, args: argparse.Namespace, device: torch.device) -> None:
    """Optimize the model in place (FP8 conversion, device move, block swap).

    Fix: the non-scaled ``--fp8`` path referenced ``torch.float8e4m3fn``,
    which does not exist (the attribute is ``torch.float8_e4m3fn``), so using
    ``--fp8`` without ``--fp8_scaled`` raised AttributeError.

    Args:
        model: dit model (modified in place)
        args: command line arguments
        device: device to use
    """
    if args.fp8_scaled:
        # load state dict as-is and optimize to fp8
        state_dict = model.state_dict()

        # if no blocks to swap, we can move the weights to GPU after optimization on GPU
        # (omits a redundant CPU->GPU copy)
        move_to_device = args.blocks_to_swap == 0  # if blocks_to_swap > 0, keep the model on CPU
        state_dict = model.fp8_optimization(state_dict, device, move_to_device, use_scaled_mm=False)  # args.fp8_fast)

        info = model.load_state_dict(state_dict, strict=True, assign=True)
        logger.info(f"Loaded FP8 optimized weights: {info}")

        if args.blocks_to_swap == 0:
            model.to(device)  # make sure all parameters are on the right device (e.g. RoPE etc.)
    else:
        # simple cast to the target dtype
        target_dtype = None  # None: load as-is (keep the dtype of the weights in state_dict)
        target_device = None

        if args.fp8:
            # fixed from torch.float8e4m3fn (nonexistent attribute)
            target_dtype = torch.float8_e4m3fn

        if args.blocks_to_swap == 0:
            logger.info(f"Move model to device: {device}")
            target_device = device

        if target_device is not None and target_dtype is not None:
            model.to(target_device, target_dtype)  # move and cast at once; reduces redundant copy operations

    if args.blocks_to_swap > 0:
        logger.info(f"Enable swap {args.blocks_to_swap} blocks to CPU from device: {device}")
        model.enable_block_swap(args.blocks_to_swap, device, supports_backward=False)
        model.move_to_device_except_swap_blocks(device)
        model.prepare_block_swap_before_forward()
    else:
        # make sure the model is on the right device
        model.to(device)

    model.eval().requires_grad_(False)
    clean_memory_on_device(device)
434
+
435
+
436
+ # endregion
437
+
438
+
439
def decode_latent(
    latent_window_size: int,
    total_latent_sections: int,
    bulk_decode: bool,
    vae: AutoencoderKLCausal3D,
    latent: torch.Tensor,
    device: torch.device,
    one_frame_inference_mode: bool = False,
) -> torch.Tensor:
    """Decode generated latents into pixel frames with the VAE.

    Three paths:
    - sectioned decode (default): slice the latent into per-section chunks,
      decode each, and blend consecutive chunks with soft_append_bcthw;
    - bulk decode: decode the whole latent in one VAE call;
    - one-frame mode: decode each latent frame independently and concatenate.

    Args:
        latent_window_size: latents per generation window (default 9 elsewhere)
        total_latent_sections: number of generated sections in `latent`
        bulk_decode: decode everything in a single VAE call
        vae: VAE model (moved to `device` for decoding, back to CPU after)
        latent: latent tensor, CFHW or NCFHW (batch dim added if missing)
        device: device to run the VAE on
        one_frame_inference_mode: decode frame-by-frame instead of as video

    Returns:
        torch.Tensor: decoded pixels, CTHW (batch dimension removed)
    """
    logger.info(f"Decoding video...")
    if latent.ndim == 4:
        latent = latent.unsqueeze(0)  # add batch dimension

    vae.to(device)
    if not bulk_decode and not one_frame_inference_mode:
        latent_window_size = latent_window_size  # default is 9 (no-op self-assignment, kept as-is)
        # total_latent_sections = (args.video_seconds * 30) / (latent_window_size * 4)
        # total_latent_sections = int(max(round(total_latent_sections), 1))
        num_frames = latent_window_size * 4 - 3

        latents_to_decode = []
        latent_frame_index = 0
        # NOTE(review): iterates i from total-1 down to 0, so the first slice taken
        # (at frame index 0) is treated as the "last section" and gets the extra
        # frame — this mirrors FramePack's reverse-order generation; confirm against
        # the sampling loop before changing.
        for i in range(total_latent_sections - 1, -1, -1):
            is_last_section = i == total_latent_sections - 1
            generated_latent_frames = (num_frames + 3) // 4 + (1 if is_last_section else 0)
            section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)

            section_latent = latent[:, :, latent_frame_index : latent_frame_index + section_latent_frames, :, :]
            if section_latent.shape[2] > 0:
                latents_to_decode.append(section_latent)

            latent_frame_index += generated_latent_frames

        latents_to_decode = latents_to_decode[::-1]  # reverse the order of latents to decode

        history_pixels = None
        for latent in tqdm(latents_to_decode):
            if history_pixels is None:
                history_pixels = hunyuan.vae_decode(latent, vae).cpu()
            else:
                # consecutive sections overlap by one window of frames; blend them
                overlapped_frames = latent_window_size * 4 - 3
                current_pixels = hunyuan.vae_decode(latent, vae).cpu()
                history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
            clean_memory_on_device(device)
    else:
        # bulk decode
        logger.info(f"Bulk decoding or one frame inference")
        if not one_frame_inference_mode:
            history_pixels = hunyuan.vae_decode(latent, vae).cpu()  # normal
        else:
            # one frame inference: decode each latent frame independently
            history_pixels = [hunyuan.vae_decode(latent[:, :, i : i + 1, :, :], vae).cpu() for i in range(latent.shape[2])]
            history_pixels = torch.cat(history_pixels, dim=2)

    vae.to("cpu")

    logger.info(f"Decoded. Pixel shape {history_pixels.shape}")
    return history_pixels[0]  # remove batch dimension
497
+
498
+
499
def prepare_i2v_inputs(
    args: argparse.Namespace,
    device: torch.device,
    vae: AutoencoderKLCausal3D,
    shared_models: Optional[Dict] = None,
) -> Tuple[int, int, float, dict, dict, dict, Optional[list]]:
    """Prepare all conditioning inputs for I2V generation.

    Parses section-indexed prompts/images, encodes prompts with the text
    encoders, images with the CLIP image encoder and the VAE, and packs the
    results into per-section argument dictionaries. Models are moved to
    `device` only while needed and back to CPU afterwards.

    Args:
        args: command line arguments
        device: device to use
        vae: VAE model, used for image encoding
        shared_models: optional dictionary of pre-loaded models; when given,
            models are reused (moved to CPU after use) instead of loaded/freed

    Returns:
        Tuple[int, int, float, dict, dict, dict, Optional[list]]:
            (height, width, video_seconds, arg_c, arg_null, arg_c_img, end_latents)
            where arg_c/arg_c_img map section index -> conditioning dicts,
            arg_null is the unconditional text conditioning, and end_latents
            is a list of VAE-encoded end images or None.
    """

    height, width, video_seconds = check_inputs(args)

    # define parsing function
    # Splits ";;;"-separated entries of the form "idx:value" or "start-end:value"
    # into a {section_index: value} dict; entries without an index go to section 0.
    def parse_section_strings(input_string: str) -> dict[int, str]:
        section_strings = {}
        if ";;;" in input_string:
            split_section_strings = input_string.split(";;;")
            for section_str in split_section_strings:
                if ":" not in section_str:
                    start = end = 0
                    section_str = section_str.strip()
                else:
                    index_str, section_str = section_str.split(":", 1)
                    index_str = index_str.strip()
                    section_str = section_str.strip()

                    # index may be a single (possibly negative) number or a "start-end" range
                    m = re.match(r"^(-?\d+)(-\d+)?$", index_str)
                    if m:
                        start = int(m.group(1))
                        end = int(m.group(2)[1:]) if m.group(2) is not None else start
                    else:
                        start = end = 0
                        section_str = section_str.strip()
                for i in range(start, end + 1):
                    section_strings[i] = section_str
        else:
            section_strings[0] = input_string

        # assert 0 in section_prompts, "Section prompts must contain section 0"
        if 0 not in section_strings:
            # use smallest section index. prefer positive index over negative index
            # if all section indices are negative, use the smallest negative index
            indices = list(section_strings.keys())
            if all(i < 0 for i in indices):
                section_index = min(indices)
            else:
                section_index = min(i for i in indices if i >= 0)
            section_strings[0] = section_strings[section_index]
        return section_strings

    # prepare image
    # Loads an image, resizes it to the target bucket, and returns both the
    # normalized NCFHW tensor (for the VAE) and the resized HWC numpy array
    # (for the CLIP image encoder).
    def preprocess_image(image_path: str):
        image = Image.open(image_path).convert("RGB")

        image_np = np.array(image)  # PIL to numpy, HWC

        image_np = image_video_dataset.resize_image_to_bucket(image_np, (width, height))
        image_tensor = torch.from_numpy(image_np).float() / 127.5 - 1.0  # -1 to 1.0, HWC
        image_tensor = image_tensor.permute(2, 0, 1)[None, :, None]  # HWC -> CHW -> NCFHW, N=1, C=3, F=1
        return image_tensor, image_np

    section_image_paths = parse_section_strings(args.image_path)

    section_images = {}
    for index, image_path in section_image_paths.items():
        img_tensor, img_np = preprocess_image(image_path)
        section_images[index] = (img_tensor, img_np)

    # check end images
    if args.end_image_path is not None and len(args.end_image_path) > 0:
        end_image_tensors = []
        for end_img_path in args.end_image_path:
            end_image_tensor, _ = preprocess_image(end_img_path)
            end_image_tensors.append(end_image_tensor)
    else:
        end_image_tensors = None

    # configure negative prompt
    n_prompt = args.negative_prompt if args.negative_prompt else ""

    # parse section prompts
    section_prompts = parse_section_strings(args.prompt)

    # load text encoder
    if shared_models is not None:
        tokenizer1, text_encoder1 = shared_models["tokenizer1"], shared_models["text_encoder1"]
        tokenizer2, text_encoder2 = shared_models["tokenizer2"], shared_models["text_encoder2"]
        text_encoder1.to(device)
    else:
        tokenizer1, text_encoder1 = load_text_encoder1(args, args.fp8_llm, device)
        tokenizer2, text_encoder2 = load_text_encoder2(args)
        text_encoder2.to(device)

    logger.info(f"Encoding prompt")
    llama_vecs = {}
    llama_attention_masks = {}
    clip_l_poolers = {}
    with torch.autocast(device_type=device.type, dtype=text_encoder1.dtype), torch.no_grad():
        for index, prompt in section_prompts.items():
            llama_vec, clip_l_pooler = hunyuan.encode_prompt_conds(
                prompt, text_encoder1, text_encoder2, tokenizer1, tokenizer2, custom_system_prompt=args.custom_system_prompt
            )
            llama_vec = llama_vec.cpu()
            clip_l_pooler = clip_l_pooler.cpu()

            llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)

            llama_vecs[index] = llama_vec
            llama_attention_masks[index] = llama_attention_mask
            clip_l_poolers[index] = clip_l_pooler

    if args.guidance_scale == 1.0:
        # no CFG: the unconditional embedding is never used, so zeros suffice
        llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vecs[0]), torch.zeros_like(clip_l_poolers[0])
    else:
        with torch.autocast(device_type=device.type, dtype=text_encoder1.dtype), torch.no_grad():
            llama_vec_n, clip_l_pooler_n = hunyuan.encode_prompt_conds(
                n_prompt, text_encoder1, text_encoder2, tokenizer1, tokenizer2, custom_system_prompt=args.custom_system_prompt
            )
            llama_vec_n = llama_vec_n.cpu()
            clip_l_pooler_n = clip_l_pooler_n.cpu()

    llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)

    # free text encoder and clean memory
    if shared_models is not None:  # if shared models are used, do not free them but move to CPU
        text_encoder1.to("cpu")
        text_encoder2.to("cpu")
    del tokenizer1, text_encoder1, tokenizer2, text_encoder2  # do not free shared models
    clean_memory_on_device(device)

    # load image encoder
    if shared_models is not None:
        feature_extractor, image_encoder = shared_models["feature_extractor"], shared_models["image_encoder"]
    else:
        feature_extractor, image_encoder = load_image_encoders(args)
    image_encoder.to(device)

    # encode image with image encoder
    section_image_encoder_last_hidden_states = {}
    for index, (img_tensor, img_np) in section_images.items():
        with torch.no_grad():
            image_encoder_output = hf_clip_vision_encode(img_np, feature_extractor, image_encoder)
        image_encoder_last_hidden_state = image_encoder_output.last_hidden_state.cpu()
        section_image_encoder_last_hidden_states[index] = image_encoder_last_hidden_state

    # free image encoder and clean memory
    if shared_models is not None:
        image_encoder.to("cpu")
    del image_encoder, feature_extractor
    clean_memory_on_device(device)

    # VAE encoding
    logger.info(f"Encoding image to latent space")
    vae.to(device)

    section_start_latents = {}
    for index, (img_tensor, img_np) in section_images.items():
        start_latent = hunyuan.vae_encode(img_tensor, vae).cpu()
        section_start_latents[index] = start_latent

    # end_latent = hunyuan.vae_encode(end_image_tensor, vae).cpu() if end_image_tensor is not None else None
    if end_image_tensors is not None:
        end_latents = []
        for end_image_tensor in end_image_tensors:
            end_latent = hunyuan.vae_encode(end_image_tensor, vae).cpu()
            end_latents.append(end_latent)
    else:
        end_latents = None

    vae.to("cpu")  # move VAE to CPU to save memory
    clean_memory_on_device(device)

    # prepare model input arguments
    arg_c = {}
    arg_null = {}
    for index in llama_vecs.keys():
        llama_vec = llama_vecs[index]
        llama_attention_mask = llama_attention_masks[index]
        clip_l_pooler = clip_l_poolers[index]
        arg_c_i = {
            "llama_vec": llama_vec,
            "llama_attention_mask": llama_attention_mask,
            "clip_l_pooler": clip_l_pooler,
            "prompt": section_prompts[index],  # for debugging
        }
        arg_c[index] = arg_c_i

    arg_null = {
        "llama_vec": llama_vec_n,
        "llama_attention_mask": llama_attention_mask_n,
        "clip_l_pooler": clip_l_pooler_n,
    }

    arg_c_img = {}
    for index in section_images.keys():
        image_encoder_last_hidden_state = section_image_encoder_last_hidden_states[index]
        start_latent = section_start_latents[index]
        arg_c_img_i = {
            "image_encoder_last_hidden_state": image_encoder_last_hidden_state,
            "start_latent": start_latent,
            "image_path": section_image_paths[index],
        }
        arg_c_img[index] = arg_c_img_i

    return height, width, video_seconds, arg_c, arg_null, arg_c_img, end_latents
714
+
715
+
716
+ # def setup_scheduler(args: argparse.Namespace, config, device: torch.device) -> Tuple[Any, torch.Tensor]:
717
+ # """setup scheduler for sampling
718
+
719
+ # Args:
720
+ # args: command line arguments
721
+ # config: model configuration
722
+ # device: device to use
723
+
724
+ # Returns:
725
+ # Tuple[Any, torch.Tensor]: (scheduler, timesteps)
726
+ # """
727
+ # if args.sample_solver == "unipc":
728
+ # scheduler = FlowUniPCMultistepScheduler(num_train_timesteps=config.num_train_timesteps, shift=1, use_dynamic_shifting=False)
729
+ # scheduler.set_timesteps(args.infer_steps, device=device, shift=args.flow_shift)
730
+ # timesteps = scheduler.timesteps
731
+ # elif args.sample_solver == "dpm++":
732
+ # scheduler = FlowDPMSolverMultistepScheduler(
733
+ # num_train_timesteps=config.num_train_timesteps, shift=1, use_dynamic_shifting=False
734
+ # )
735
+ # sampling_sigmas = get_sampling_sigmas(args.infer_steps, args.flow_shift)
736
+ # timesteps, _ = retrieve_timesteps(scheduler, device=device, sigmas=sampling_sigmas)
737
+ # elif args.sample_solver == "vanilla":
738
+ # scheduler = FlowMatchDiscreteScheduler(num_train_timesteps=config.num_train_timesteps, shift=args.flow_shift)
739
+ # scheduler.set_timesteps(args.infer_steps, device=device)
740
+ # timesteps = scheduler.timesteps
741
+
742
+ # # FlowMatchDiscreteScheduler does not support generator argument in step method
743
+ # org_step = scheduler.step
744
+
745
+ # def step_wrapper(
746
+ # model_output: torch.Tensor,
747
+ # timestep: Union[int, torch.Tensor],
748
+ # sample: torch.Tensor,
749
+ # return_dict: bool = True,
750
+ # generator=None,
751
+ # ):
752
+ # return org_step(model_output, timestep, sample, return_dict=return_dict)
753
+
754
+ # scheduler.step = step_wrapper
755
+ # else:
756
+ # raise NotImplementedError("Unsupported solver.")
757
+
758
+ # return scheduler, timesteps
759
+
760
+
761
def convert_lora_for_framepack(lora_sd: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
    """Normalize a LoRA state dict into the FramePack (Musubi Tuner) format.

    Detects the source format from the key names: Musubi Tuner files are used
    as-is, diffusion-pipe style files ("lora_A"/"lora_B" keys under a
    transformer prefix) are converted to the default LoRA layout, and
    HunyuanVideo LoRAs (double_blocks/single_blocks) are remapped to
    FramePack module names.
    """
    keys = list(lora_sd.keys())

    if keys[0].startswith("lora_unet_"):
        # already in Musubi Tuner format; nothing to convert
        pass
    else:
        transformer_prefixes = ["diffusion_model", "transformer"]  # to ignore Text Encoder modules
        lora_suffix = None
        prefix = None
        for key in keys:
            if lora_suffix is None and "lora_A" in key:
                lora_suffix = "lora_A"
            if prefix is None:
                pfx = key.split(".")[0]
                if pfx in transformer_prefixes:
                    prefix = pfx
            if lora_suffix is not None and prefix is not None:
                break

        if lora_suffix == "lora_A" and prefix is not None:
            logging.info(f"Diffusion-pipe (?) LoRA detected, converting to the default LoRA format")
            lora_sd = convert_lora_from_diffusion_pipe_or_something(lora_sd, "lora_unet_")
        else:
            logging.info(f"LoRA file format not recognized. Using it as-is.")

    # HunyuanVideo LoRAs use double_blocks/single_blocks module names
    is_hunyuan = any("double_blocks" in key or "single_blocks" in key for key in lora_sd.keys())
    if is_hunyuan:
        logging.info("HunyuanVideo LoRA detected, converting to FramePack format")
        lora_sd = convert_hunyuan_to_framepack(lora_sd)

    return lora_sd
800
+
801
+
802
def convert_lora_from_diffusion_pipe_or_something(lora_sd: dict[str, torch.Tensor], prefix: str) -> dict[str, torch.Tensor]:
    """
    Convert LoRA weights to the format used by the diffusion pipeline to Musubi Tuner.
    Copy from Musubi Tuner repo.
    """
    # Diffusers(?) format: {"diffusion_model.module.name.lora_A.weight": w, "...lora_B.weight": w, ...}
    # default LoRA format: {"prefix_module_name.lora_down.weight": w, "...lora_up.weight": w, ...}
    # Diffusers carries no alpha, so alpha is set to the rank below.
    converted: dict[str, torch.Tensor] = {}
    rank_by_module: dict[str, int] = {}

    for key, weight in lora_sd.items():
        source_prefix, remainder = key.split(".", 1)
        if source_prefix not in ("diffusion_model", "transformer"):
            print(f"unexpected key: {key} in diffusers format")
            continue

        new_key = f"{prefix}{remainder}".replace(".", "_").replace("_lora_A_", ".lora_down.").replace("_lora_B_", ".lora_up.")
        converted[new_key] = weight

        module_name = new_key.split(".")[0]  # before first dot
        if module_name not in rank_by_module and "lora_down" in new_key:
            # rank = number of rows of the lora_down matrix
            rank_by_module[module_name] = weight.shape[0]

    # add alpha with rank
    for module_name, rank in rank_by_module.items():
        converted[f"{module_name}.alpha"] = torch.tensor(rank)

    return converted
832
+
833
+
834
def convert_hunyuan_to_framepack(lora_sd: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
    """
    Convert HunyuanVideo LoRA weights to FramePack format.

    Module names are remapped with the tables below; the fused QKV (double
    blocks) and QKVM (single blocks) projections are split into separate
    q/k/v(/mlp) modules: lora_down and alpha are shared copies, lora_up is
    sliced row-wise.
    """
    double_renames = [
        ("double_blocks", "transformer_blocks"),
        ("img_mod_linear", "norm1_linear"),
        ("img_attn_qkv", "attn_to_QKV"),  # split later
        ("img_attn_proj", "attn_to_out_0"),
        ("img_mlp_fc1", "ff_net_0_proj"),
        ("img_mlp_fc2", "ff_net_2"),
        ("txt_mod_linear", "norm1_context_linear"),
        ("txt_attn_qkv", "attn_add_QKV_proj"),  # split later
        ("txt_attn_proj", "attn_to_add_out"),
        ("txt_mlp_fc1", "ff_context_net_0_proj"),
        ("txt_mlp_fc2", "ff_context_net_2"),
    ]
    single_renames = [
        ("single_blocks", "single_transformer_blocks"),
        ("linear1", "attn_to_QKVM"),  # split later
        ("linear2", "proj_out"),
        ("modulation_linear", "norm_linear"),
    ]

    new_lora_sd: dict[str, torch.Tensor] = {}
    for key, weight in lora_sd.items():
        if "double_blocks" in key:
            renames = double_renames
        elif "single_blocks" in key:
            renames = single_renames
        else:
            print(f"Unsupported module name: {key}, only double_blocks and single_blocks are supported")
            continue
        for old, new in renames:
            key = key.replace(old, new)

        if "QKVM" in key:
            # fused Q/K/V/MLP projection of single blocks: split into four modules
            split_keys = [key.replace("QKVM", t) for t in ("q", "k", "v")]
            split_keys.append(key.replace("attn_to_QKVM", "proj_mlp"))
            if "_down" in key or "alpha" in key:
                # lora_down (and alpha) is shared: copy it to all four modules
                assert "alpha" in key or weight.size(1) == 3072, f"QKVM weight size mismatch: {key}. {weight.size()}"
                for k in split_keys:
                    new_lora_sd[k] = weight
            elif "_up" in key:
                # lora_up rows are laid out [Q|K|V|MLP]: slice them apart (21504 = 3*3072 + 12288)
                assert weight.size(0) == 21504, f"QKVM weight size mismatch: {key}. {weight.size()}"
                slices = (weight[:3072], weight[3072 : 3072 * 2], weight[3072 * 2 : 3072 * 3], weight[3072 * 3 :])
                for k, w in zip(split_keys, slices):
                    new_lora_sd[k] = w
            else:
                print(f"Unsupported module name: {key}")
                continue
        elif "QKV" in key:
            # fused Q/K/V projection: split into three modules
            split_keys = [key.replace("QKV", t) for t in ("q", "k", "v")]
            if "_down" in key or "alpha" in key:
                # lora_down (and alpha) is shared: copy it to all three modules
                assert "alpha" in key or weight.size(1) == 3072, f"QKV weight size mismatch: {key}. {weight.size()}"
                for k in split_keys:
                    new_lora_sd[k] = weight
            elif "_up" in key:
                # lora_up rows are laid out [Q|K|V]: slice them apart
                assert weight.size(0) == 3072 * 3, f"QKV weight size mismatch: {key}. {weight.size()}"
                slices = (weight[:3072], weight[3072 : 3072 * 2], weight[3072 * 2 :])
                for k, w in zip(split_keys, slices):
                    new_lora_sd[k] = w
            else:
                print(f"Unsupported module name: {key}")
                continue
        else:
            # no split needed
            new_lora_sd[key] = weight

    return new_lora_sd
909
+
910
+
911
def generate(
    args: argparse.Namespace, gen_settings: GenerationSettings, shared_models: Optional[Dict] = None
) -> tuple[AutoencoderKLCausal3D, torch.Tensor]:
    """main function for generation

    Runs FramePack section-by-section sampling: prepares conditioning inputs,
    loads (or reuses) the DiT model, then samples each latent section and
    stitches the growing latent history.

    Args:
        args: command line arguments
        gen_settings: device / DiT weight dtype settings
        shared_models: dictionary containing pre-loaded models; may be updated
            in place with the loaded DiT model under key "model"

    Returns:
        tuple: (AutoencoderKLCausal3D model (vae), torch.Tensor generated latent)
        (None, None) when args.save_merged_model is set.
    """
    device, dit_weight_dtype = (gen_settings.device, gen_settings.dit_weight_dtype)

    # prepare seed
    seed = args.seed if args.seed is not None else random.randint(0, 2**32 - 1)
    args.seed = seed  # set seed to args for saving

    # Check if we have shared models
    if shared_models is not None:
        # Use shared models and encoded data
        vae = shared_models.get("vae")
        height, width, video_seconds, context, context_null, context_img, end_latents = prepare_i2v_inputs(
            args, device, vae, shared_models
        )
    else:
        # prepare inputs without shared models
        vae = load_vae(args.vae, args.vae_chunk_size, args.vae_spatial_tile_sample_min_size, device)
        height, width, video_seconds, context, context_null, context_img, end_latents = prepare_i2v_inputs(args, device, vae)

    if shared_models is None or "model" not in shared_models:
        # load DiT model
        model = load_dit_model(args, device)

        # merge LoRA weights
        if args.lora_weight is not None and len(args.lora_weight) > 0:
            # ugly hack to common merge_lora_weights function
            merge_lora_weights(lora_framepack, model, args, device, convert_lora_for_framepack)

            # if we only want to save the model, we can skip the rest
            if args.save_merged_model:
                return None, None

        # optimize model: fp8 conversion, block swap etc.
        optimize_model(model, args, device)

        if shared_models is not None:
            shared_models["model"] = model
    else:
        # use shared model
        model: HunyuanVideoTransformer3DModelPacked = shared_models["model"]
        model.move_to_device_except_swap_blocks(device)
        model.prepare_block_swap_before_forward()

    # sampling
    latent_window_size = args.latent_window_size  # default is 9
    # ex: (5s * 30fps) / (9 * 4) = 4.16 -> 4 sections, 60s -> 1800 / 36 = 50 sections
    total_latent_sections = (video_seconds * 30) / (latent_window_size * 4)
    total_latent_sections = int(max(round(total_latent_sections), 1))

    # set random generator
    seed_g = torch.Generator(device="cpu")
    seed_g.manual_seed(seed)
    num_frames = latent_window_size * 4 - 3

    logger.info(
        f"Video size: {height}x{width}@{video_seconds} (HxW@seconds), fps: {args.fps}, num sections: {total_latent_sections}, "
        f"infer_steps: {args.infer_steps}, frames per generation: {num_frames}"
    )

    # video generation ######
    f1_mode = args.f1
    one_frame_inference = None
    if args.one_frame_inference is not None:
        # parse comma-separated one-frame-inference options into a set
        one_frame_inference = set()
        for mode in args.one_frame_inference.split(","):
            one_frame_inference.add(mode.strip())

    # prepare history latents (1 post frame + 2x + 4x context slots)
    history_latents = torch.zeros((1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32)
    if end_latents is not None and not f1_mode:
        logger.info(f"Use end image(s): {args.end_image_path}")
        for i, end_latent in enumerate(end_latents):
            history_latents[:, :, i + 1 : i + 2] = end_latent.to(history_latents)

    # prepare clean latents and indices
    if not f1_mode:
        # Inverted Anti-drifting
        total_generated_latent_frames = 0
        latent_paddings = reversed(range(total_latent_sections))

        if total_latent_sections > 4 and one_frame_inference is None:
            # In theory the latent_paddings should follow the above sequence, but it seems that duplicating some
            # items looks better than expanding it when total_latent_sections > 4
            # One can try to remove below trick and just
            # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare
            # 4 sections: 3, 2, 1, 0. 50 sections: 3, 2, 2, ... 2, 1, 0
            latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]

        if args.latent_paddings is not None:
            # parse user defined latent paddings
            user_latent_paddings = [int(x) for x in args.latent_paddings.split(",")]
            if len(user_latent_paddings) < total_latent_sections:
                print(
                    f"User defined latent paddings length {len(user_latent_paddings)} does not match total sections {total_latent_sections}."
                )
                print(f"Use default paddings instead for unspecified sections.")
                # NOTE(review): if total_latent_sections <= 4, latent_paddings is still a
                # `reversed` iterator here, and slice assignment would raise TypeError — confirm
                latent_paddings[: len(user_latent_paddings)] = user_latent_paddings
            elif len(user_latent_paddings) > total_latent_sections:
                print(
                    f"User defined latent paddings length {len(user_latent_paddings)} is greater than total sections {total_latent_sections}."
                )
                print(f"Use only first {total_latent_sections} paddings instead.")
                latent_paddings = user_latent_paddings[:total_latent_sections]
            else:
                latent_paddings = user_latent_paddings
    else:
        start_latent = context_img[0]["start_latent"]
        history_latents = torch.cat([history_latents, start_latent], dim=2)
        total_generated_latent_frames = 1  # a bit hacky, but we employ the same logic as in official code
        latent_paddings = [0] * total_latent_sections  # dummy paddings for F1 mode

    latent_paddings = list(latent_paddings)  # make sure it's a list
    for loop_index in range(total_latent_sections):
        latent_padding = latent_paddings[loop_index]

        if not f1_mode:
            # Inverted Anti-drifting: sections are generated in reverse order
            section_index_reverse = loop_index  # 0, 1, 2, 3
            section_index = total_latent_sections - 1 - section_index_reverse  # 3, 2, 1, 0
            section_index_from_last = -(section_index_reverse + 1)  # -1, -2, -3, -4

            is_last_section = section_index == 0
            is_first_section = section_index_reverse == 0
            latent_padding_size = latent_padding * latent_window_size

            logger.info(f"latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}")
        else:
            section_index = loop_index  # 0, 1, 2, 3
            section_index_from_last = section_index - total_latent_sections  # -4, -3, -2, -1
            is_last_section = loop_index == total_latent_sections - 1
            is_first_section = loop_index == 0
            latent_padding_size = 0  # dummy padding for F1 mode

        # select start latent: prefer a per-section image keyed from the end, then from the start
        if section_index_from_last in context_img:
            image_index = section_index_from_last
        elif section_index in context_img:
            image_index = section_index
        else:
            image_index = 0

        start_latent = context_img[image_index]["start_latent"]
        image_path = context_img[image_index]["image_path"]
        if image_index != 0:  # use section image other than section 0
            logger.info(f"Apply experimental section image, latent_padding_size = {latent_padding_size}, image_path = {image_path}")

        if not f1_mode:
            # Inverted Anti-drifting
            indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0)
            (
                clean_latent_indices_pre,
                blank_indices,
                latent_indices,
                clean_latent_indices_post,
                clean_latent_2x_indices,
                clean_latent_4x_indices,
            ) = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1)
            clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)

            clean_latents_pre = start_latent.to(history_latents)
            clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, : 1 + 2 + 16, :, :].split(
                [1, 2, 16], dim=2
            )
            clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)

            if end_latents is not None:
                # extend clean latents/indices with the provided end-image latents
                clean_latents = torch.cat([clean_latents_pre, history_latents[:, :, : len(end_latents)]], dim=2)
                clean_latent_indices_extended = torch.zeros(1, 1 + len(end_latents), dtype=clean_latent_indices.dtype)
                clean_latent_indices_extended[:, :2] = clean_latent_indices
                clean_latent_indices = clean_latent_indices_extended

        else:
            # F1 mode
            indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
            (
                clean_latent_indices_start,
                clean_latent_4x_indices,
                clean_latent_2x_indices,
                clean_latent_1x_indices,
                latent_indices,
            ) = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
            clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)

            clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]) :, :, :].split(
                [16, 2, 1], dim=2
            )
            clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)

        # if use_teacache:
        #     transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
        # else:
        #     transformer.initialize_teacache(enable_teacache=False)

        # prepare conditioning inputs: per-section prompt keyed from the end, then the start
        if section_index_from_last in context:
            prompt_index = section_index_from_last
        elif section_index in context:
            prompt_index = section_index
        else:
            prompt_index = 0

        context_for_index = context[prompt_index]
        # if args.section_prompts is not None:
        logger.info(f"Section {section_index}: {context_for_index['prompt']}")

        llama_vec = context_for_index["llama_vec"].to(device, dtype=torch.bfloat16)
        llama_attention_mask = context_for_index["llama_attention_mask"].to(device)
        clip_l_pooler = context_for_index["clip_l_pooler"].to(device, dtype=torch.bfloat16)

        image_encoder_last_hidden_state = context_img[image_index]["image_encoder_last_hidden_state"].to(
            device, dtype=torch.bfloat16
        )

        llama_vec_n = context_null["llama_vec"].to(device, dtype=torch.bfloat16)
        llama_attention_mask_n = context_null["llama_attention_mask"].to(device)
        clip_l_pooler_n = context_null["clip_l_pooler"].to(device, dtype=torch.bfloat16)

        # call DiT model to generate latents
        sample_num_frames = num_frames
        if one_frame_inference is not None:
            # one frame inference
            latent_indices = latent_indices[:, -1:]  # only use the last frame (default)
            sample_num_frames = 1

            def get_latent_mask(mask_path: str):
                # Load a grayscale mask and resize to latent resolution, values in [0, 1].
                mask_image = Image.open(mask_path).convert("L")  # grayscale
                mask_image = mask_image.resize((width // 8, height // 8), Image.LANCZOS)
                mask_image = np.array(mask_image)  # PIL to numpy, HW (grayscale)
                mask_image = torch.from_numpy(mask_image).float() / 255.0  # 0 to 1.0
                mask_image = mask_image.squeeze(-1)  # no-op for 2D HW tensors
                mask_image = mask_image.unsqueeze(0).unsqueeze(0)  # HW -> 11HW
                mask_image = mask_image.to(clean_latents)
                return mask_image

            if args.image_mask_path is not None:
                mask_image = get_latent_mask(args.image_mask_path)
                logger.info(f"Apply mask for clean latents (start image): {args.image_mask_path}, shape: {mask_image.shape}")
                clean_latents[:, :, 0, :, :] = clean_latents[:, :, 0, :, :] * mask_image
            if args.end_image_mask_path is not None and len(args.end_image_mask_path) > 0:
                # # apply mask for clean latents 1x (end image)
                # NOTE(review): assumes end_latents is not None whenever end-image masks are given — confirm
                count = min(len(args.end_image_mask_path), len(end_latents))
                for i in range(count):
                    mask_image = get_latent_mask(args.end_image_mask_path[i])
                    logger.info(
                        f"Apply mask for clean latents 1x (end image) for {i+1}: {args.end_image_mask_path[i]}, shape: {mask_image.shape}"
                    )
                    clean_latents[:, :, i + 1 : i + 2, :, :] = clean_latents[:, :, i + 1 : i + 2, :, :] * mask_image

            for one_frame_param in one_frame_inference:
                if one_frame_param.startswith("target_index="):
                    target_index = int(one_frame_param.split("=")[1])
                    latent_indices[:, 0] = target_index
                    logger.info(f"Set index for target: {target_index}")
                elif one_frame_param.startswith("start_index="):
                    start_index = int(one_frame_param.split("=")[1])
                    clean_latent_indices[:, 0] = start_index
                    logger.info(f"Set index for clean latent pre (start image): {start_index}")
                elif one_frame_param.startswith("history_index="):
                    history_indices = one_frame_param.split("=")[1].split(";")
                    i = 0
                    while i < len(history_indices) and i < len(end_latents):
                        history_index = int(history_indices[i])
                        clean_latent_indices[:, 1 + i] = history_index
                        i += 1
                    # remaining slots reuse the last parsed history_index
                    # NOTE(review): if the first while loop runs zero iterations, history_index
                    # is unbound here and would raise NameError — confirm intended input contract
                    while i < len(end_latents):
                        clean_latent_indices[:, 1 + i] = history_index
                        i += 1
                    logger.info(f"Set index for clean latent post (end image): {history_indices}")

            if "no_2x" in one_frame_inference:
                clean_latents_2x = None
                clean_latent_2x_indices = None
                logger.info(f"No clean_latents_2x")
            if "no_4x" in one_frame_inference:
                clean_latents_4x = None
                clean_latent_4x_indices = None
                logger.info(f"No clean_latents_4x")
            if "no_post" in one_frame_inference:
                clean_latents = clean_latents[:, :, :1, :, :]
                clean_latent_indices = clean_latent_indices[:, :1]
                logger.info(f"No clean_latents post")
            elif "zero_post" in one_frame_inference:
                # zero out the history latents. this seems to prevent the images from corrupting
                clean_latents[:, :, 1:, :, :] = torch.zeros_like(clean_latents[:, :, 1:, :, :])
                logger.info(f"Zero out clean_latents post")

            logger.info(
                f"One frame inference. clean_latent: {clean_latents.shape} latent_indices: {latent_indices}, clean_latent_indices: {clean_latent_indices}, num_frames: {sample_num_frames}"
            )

        generated_latents = sample_hunyuan(
            transformer=model,
            sampler=args.sample_solver,
            width=width,
            height=height,
            frames=sample_num_frames,
            real_guidance_scale=args.guidance_scale,
            distilled_guidance_scale=args.embedded_cfg_scale,
            guidance_rescale=args.guidance_rescale,
            # shift=3.0,
            num_inference_steps=args.infer_steps,
            generator=seed_g,
            prompt_embeds=llama_vec,
            prompt_embeds_mask=llama_attention_mask,
            prompt_poolers=clip_l_pooler,
            negative_prompt_embeds=llama_vec_n,
            negative_prompt_embeds_mask=llama_attention_mask_n,
            negative_prompt_poolers=clip_l_pooler_n,
            device=device,
            dtype=torch.bfloat16,
            image_embeddings=image_encoder_last_hidden_state,
            latent_indices=latent_indices,
            clean_latents=clean_latents,
            clean_latent_indices=clean_latent_indices,
            clean_latents_2x=clean_latents_2x,
            clean_latent_2x_indices=clean_latent_2x_indices,
            clean_latents_4x=clean_latents_4x,
            clean_latent_4x_indices=clean_latent_4x_indices,
        )

        # concatenate generated latents
        total_generated_latent_frames += int(generated_latents.shape[2])
        if not f1_mode:
            # Inverted Anti-drifting: prepend generated latents to history latents
            if is_last_section:
                generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)
                total_generated_latent_frames += 1

            history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
            real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
        else:
            # F1 mode: append generated latents to history latents
            history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
            real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]

        logger.info(f"Generated. Latent shape {real_history_latents.shape}")

        # # TODO support saving intermediate video
        # clean_memory_on_device(device)
        # vae.to(device)
        # if history_pixels is None:
        #     history_pixels = hunyuan.vae_decode(real_history_latents, vae).cpu()
        # else:
        #     section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
        #     overlapped_frames = latent_window_size * 4 - 3
        #     current_pixels = hunyuan.vae_decode(real_history_latents[:, :, :section_latent_frames], vae).cpu()
        #     history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
        # vae.to("cpu")
        # # if not is_last_section:
        # #     # save intermediate video
        # #     save_video(history_pixels[0], args, total_generated_latent_frames)
        # print(f"Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}")

    if one_frame_inference is not None:
        real_history_latents = real_history_latents[:, :, 1:, :, :]  # remove the first frame (start_latent)

    # Only clean up shared models if they were created within this function
    if shared_models is None:
        del model  # free memory
        synchronize_device(device)
    else:
        # move model to CPU to save memory
        model.to("cpu")

    # wait for 5 seconds until block swap is done
    logger.info("Waiting for 5 seconds to finish block swap")
    time.sleep(5)

    gc.collect()
    clean_memory_on_device(device)

    return vae, real_history_latents
1294
+
1295
+
1296
def save_latent(latent: torch.Tensor, args: argparse.Namespace, height: int, width: int) -> str:
    """Save latent to file

    Writes the latent tensor as a safetensors file, optionally embedding
    generation parameters as string metadata (disabled by --no-metadata).

    Args:
        latent: Latent tensor
        args: command line arguments
        height: height of frame
        width: width of frame

    Returns:
        str: Path to saved latent file
    """
    save_path = args.save_path
    os.makedirs(save_path, exist_ok=True)
    time_flag = datetime.fromtimestamp(time.time()).strftime("%Y%m%d-%H%M%S")

    seed = args.seed
    video_seconds = args.video_seconds
    latent_path = f"{save_path}/{time_flag}_{seed}_latent.safetensors"

    if args.no_metadata:
        metadata = None
    else:
        # all metadata values must be strings for safetensors
        metadata = {
            "seeds": f"{seed}",
            "prompt": f"{args.prompt}",
            "height": f"{height}",
            "width": f"{width}",
            "video_seconds": f"{video_seconds}",
            "infer_steps": f"{args.infer_steps}",
            "guidance_scale": f"{args.guidance_scale}",
            # fix: "latent_window_size" appeared twice in the original dict literal;
            # the duplicate silently overwrote the first entry
            "latent_window_size": f"{args.latent_window_size}",
            "embedded_cfg_scale": f"{args.embedded_cfg_scale}",
            "guidance_rescale": f"{args.guidance_rescale}",
            "sample_solver": f"{args.sample_solver}",
            "fps": f"{args.fps}",
        }
        if args.negative_prompt is not None:
            metadata["negative_prompt"] = f"{args.negative_prompt}"

    sd = {"latent": latent.contiguous()}
    save_file(sd, latent_path, metadata=metadata)
    logger.info(f"Latent saved to: {latent_path}")

    return latent_path
1342
+
1343
+
1344
def save_video(
    video: torch.Tensor, args: argparse.Namespace, original_base_name: Optional[str] = None, latent_frames: Optional[int] = None
) -> str:
    """Save video to file

    Args:
        video: Video tensor
        args: command line arguments
        original_base_name: Original base name (if latents are loaded from files)
        latent_frames: optional latent frame count, appended to the filename

    Returns:
        str: Path to saved video file
    """
    out_dir = args.save_path
    os.makedirs(out_dir, exist_ok=True)
    timestamp = datetime.fromtimestamp(time.time()).strftime("%Y%m%d-%H%M%S")

    # optional filename suffixes
    name_suffix = f"_{original_base_name}" if original_base_name is not None else ""
    frames_suffix = f"_{latent_frames}" if latent_frames is not None else ""
    video_path = f"{out_dir}/{timestamp}_{args.seed}{name_suffix}{frames_suffix}.mp4"

    # add a batch dimension before writing the grid
    save_videos_grid(video.unsqueeze(0), video_path, fps=args.fps, rescale=True)
    logger.info(f"Video saved to: {video_path}")

    return video_path
1371
+
1372
+
1373
def save_images(sample: torch.Tensor, args: argparse.Namespace, original_base_name: Optional[str] = None) -> str:
    """Save images to directory

    Args:
        sample: Video tensor
        args: command line arguments
        original_base_name: Original base name (if latents are loaded from files)

    Returns:
        str: Path to saved images directory
    """
    out_dir = args.save_path
    os.makedirs(out_dir, exist_ok=True)
    timestamp = datetime.fromtimestamp(time.time()).strftime("%Y%m%d-%H%M%S")

    suffix = f"_{original_base_name}" if original_base_name is not None else ""
    image_name = f"{timestamp}_{args.seed}{suffix}"

    # one-frame mode writes directly into save_path instead of a per-run subdir
    one_frame_mode = args.one_frame_inference is not None
    save_images_grid(sample.unsqueeze(0), out_dir, image_name, rescale=True, create_subdir=not one_frame_mode)
    logger.info(f"Sample images saved to: {out_dir}/{image_name}")

    return f"{out_dir}/{image_name}"
1397
+
1398
+
1399
def save_output(
    args: argparse.Namespace,
    vae: AutoencoderKLCausal3D,
    latent: torch.Tensor,
    device: torch.device,
    original_base_names: Optional[List[str]] = None,
) -> None:
    """save output

    Saves the latent and/or the decoded video/images depending on
    args.output_type ("latent", "video", "images", "both", "latent_images").

    Args:
        args: command line arguments
        vae: VAE model
        latent: latent tensor
        device: device to use
        original_base_names: original base names (if latents are loaded from files)
    """
    # pixel dimensions are 8x the latent spatial dimensions (BCTHW layout)
    height = latent.shape[-2] * 8
    width = latent.shape[-1] * 8

    output_type = args.output_type
    if output_type in ("latent", "both", "latent_images"):
        # save latent
        save_latent(latent, args, height, width)
    if output_type == "latent":
        return

    total_latent_sections = int(max(round((args.video_seconds * 30) / (args.latent_window_size * 4)), 1))
    video = decode_latent(
        args.latent_window_size, total_latent_sections, args.bulk_decode, vae, latent, device, args.one_frame_inference is not None
    )

    original_name = "" if original_base_names is None else f"_{original_base_names[0]}"
    if output_type in ("video", "both"):
        # save video
        save_video(video, args, original_name)
    elif output_type in ("images", "latent_images"):
        # save images
        save_images(video, args, original_name)
1440
+
1441
+
1442
def preprocess_prompts_for_batch(prompt_lines: List[str], base_args: argparse.Namespace) -> List[Dict]:
    """Process multiple prompts for batch mode

    Args:
        prompt_lines: List of prompt lines
        base_args: Base command line arguments

    Returns:
        List[Dict]: List of prompt data dictionaries
    """
    parsed: List[Dict] = []

    for raw_line in prompt_lines:
        stripped = raw_line.strip()
        # skip blank lines and '#' comment lines
        if not stripped or stripped.startswith("#"):
            continue

        # turn the prompt line into an override dictionary
        prompt_data = parse_prompt_line(stripped)
        logger.info(f"Parsed prompt data: {prompt_data}")
        parsed.append(prompt_data)

    return parsed
1465
+
1466
+
1467
def load_shared_models(args: argparse.Namespace) -> Dict:
    """Load shared models for batch processing or interactive mode.
    Models are loaded to CPU to save memory.

    Args:
        args: Base command line arguments

    Returns:
        Dict: Dictionary of shared models
    """
    tokenizer1, text_encoder1 = load_text_encoder1(args, args.fp8_llm, "cpu")
    tokenizer2, text_encoder2 = load_text_encoder2(args)
    feature_extractor, image_encoder = load_image_encoders(args)
    vae = load_vae(args.vae, args.vae_chunk_size, args.vae_spatial_tile_sample_min_size, "cpu")
    return {
        "tokenizer1": tokenizer1,
        "text_encoder1": text_encoder1,
        "tokenizer2": tokenizer2,
        "text_encoder2": text_encoder2,
        "feature_extractor": feature_extractor,
        "image_encoder": image_encoder,
        "vae": vae,
    }
1491
+
1492
+
1493
def process_batch_prompts(prompts_data: List[Dict], args: argparse.Namespace) -> None:
    """Process multiple prompts with model reuse

    Generates latents for all prompts first (reusing models via shared_models),
    then frees the encoders and decodes all latents in a second pass.

    Args:
        prompts_data: List of prompt data dictionaries
        args: Base command line arguments
    """
    if not prompts_data:
        logger.warning("No valid prompts found")
        return

    # 1. Load configuration
    gen_settings = get_generation_settings(args)
    device = gen_settings.device

    # 2. Load models to CPU in advance except for VAE and DiT
    shared_models = load_shared_models(args)

    # 3. Generate for each prompt
    all_latents = []
    all_prompt_args = []

    with torch.no_grad():
        for prompt_data in prompts_data:
            prompt = prompt_data["prompt"]
            prompt_args = apply_overrides(args, prompt_data)
            logger.info(f"Processing prompt: {prompt}")

            try:
                vae, latent = generate(prompt_args, gen_settings, shared_models)

                # Save latent if needed
                if args.output_type == "latent" or args.output_type == "both" or args.output_type == "latent_images":
                    height, width = latent.shape[-2], latent.shape[-1]  # BCTHW
                    height *= 8
                    width *= 8
                    save_latent(latent, prompt_args, height, width)

                all_latents.append(latent)
                all_prompt_args.append(prompt_args)
            except Exception as e:
                # best-effort batch: a failed prompt is logged and skipped
                logger.error(f"Error processing prompt: {prompt}. Error: {e}")
                continue

    # 4. Free models (keep only the VAE for decoding)
    if "model" in shared_models:
        del shared_models["model"]
    del shared_models["tokenizer1"]
    del shared_models["text_encoder1"]
    del shared_models["tokenizer2"]
    del shared_models["text_encoder2"]
    del shared_models["feature_extractor"]
    del shared_models["image_encoder"]

    clean_memory_on_device(device)
    synchronize_device(device)

    # 5. Decode latents if needed
    if args.output_type != "latent":
        logger.info("Decoding latents to videos/images")
        # NOTE(review): `vae` comes from the last successful generate() call; if every
        # prompt failed, it is unbound here and this raises NameError — confirm
        vae.to(device)

        for i, (latent, prompt_args) in enumerate(zip(all_latents, all_prompt_args)):
            logger.info(f"Decoding output {i+1}/{len(all_latents)}")

            # avoid saving latents again (ugly hack)
            if prompt_args.output_type == "both":
                prompt_args.output_type = "video"
            elif prompt_args.output_type == "latent_images":
                prompt_args.output_type = "images"

            save_output(prompt_args, vae, latent[0], device)
1565
+
1566
+
1567
def process_interactive(args: argparse.Namespace) -> None:
    """Process prompts in interactive mode

    Reads prompt lines from stdin until EOF; Ctrl+C interrupts the current
    generation but keeps the session alive.

    Args:
        args: Base command line arguments
    """
    gen_settings = get_generation_settings(args)
    device = gen_settings.device
    shared_models = load_shared_models(args)

    print("Interactive mode. Enter prompts (Ctrl+D or Ctrl+Z (Windows) to exit):")

    try:
        while True:
            try:
                line = input("> ")
                if not line.strip():
                    continue

                # parse the prompt line and overlay its overrides on the base args
                prompt_args = apply_overrides(args, parse_prompt_line(line))

                # generate, then save latent and/or video
                vae, latent = generate(prompt_args, gen_settings, shared_models)
                save_output(prompt_args, vae, latent[0], device)

            except KeyboardInterrupt:
                print("\nInterrupted. Continue (Ctrl+D or Ctrl+Z (Windows) to exit)")

    except EOFError:
        print("\nExiting interactive mode")
1602
+
1603
+
1604
def get_generation_settings(args: argparse.Namespace) -> GenerationSettings:
    """Build GenerationSettings (device and DiT weight dtype) from CLI args.

    Args:
        args: command line arguments (uses args.device, args.fp8_scaled, args.fp8)

    Returns:
        GenerationSettings: resolved device and DiT weight dtype
    """
    device = torch.device(args.device)

    dit_weight_dtype = None  # default
    if args.fp8_scaled:
        dit_weight_dtype = None  # various precision weights, so don't cast to specific dtype
    elif args.fp8:
        dit_weight_dtype = torch.float8_e4m3fn

    # fix: log message contained a doubled word ("weight weight")
    logger.info(f"Using device: {device}, DiT weight precision: {dit_weight_dtype}")

    gen_settings = GenerationSettings(device=device, dit_weight_dtype=dit_weight_dtype)
    return gen_settings
1617
+
1618
+
1619
def main():
    """Entry point: dispatches to latent-decode, batch, interactive, or single-prompt mode."""
    # Parse arguments
    args = parse_args()

    # Check if latents are provided
    latents_mode = args.latent_path is not None and len(args.latent_path) > 0

    # Set device
    device = args.device if args.device is not None else "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)
    logger.info(f"Using device: {device}")
    args.device = device

    if latents_mode:
        # Original latent decode mode: load previously saved latents and decode them
        original_base_names = []
        latents_list = []
        seeds = []

        # assert len(args.latent_path) == 1, "Only one latent path is supported for now"

        for latent_path in args.latent_path:
            original_base_names.append(os.path.splitext(os.path.basename(latent_path))[0])
            seed = 0

            if os.path.splitext(latent_path)[1] != ".safetensors":
                latents = torch.load(latent_path, map_location="cpu")
            else:
                latents = load_file(latent_path)["latent"]
                # restore generation parameters from safetensors metadata when present
                with safe_open(latent_path, framework="pt") as f:
                    metadata = f.metadata()
                if metadata is None:
                    metadata = {}
                logger.info(f"Loaded metadata: {metadata}")

                if "seeds" in metadata:
                    seed = int(metadata["seeds"])
                if "height" in metadata and "width" in metadata:
                    height = int(metadata["height"])
                    width = int(metadata["width"])
                    args.video_size = [height, width]
                if "video_seconds" in metadata:
                    args.video_seconds = float(metadata["video_seconds"])

            seeds.append(seed)
            logger.info(f"Loaded latent from {latent_path}. Shape: {latents.shape}")

            if latents.ndim == 5:  # [BCTHW]
                latents = latents.squeeze(0)  # [CTHW]

            latents_list.append(latents)

        # latent = torch.stack(latents_list, dim=0)  # [N, ...], must be same shape

        for i, latent in enumerate(latents_list):
            args.seed = seeds[i]

            # NOTE(review): the VAE is re-loaded on every iteration — presumably
            # acceptable for a few latents, but could be hoisted; confirm
            vae = load_vae(args.vae, args.vae_chunk_size, args.vae_spatial_tile_sample_min_size, device)
            save_output(args, vae, latent, device, original_base_names)

    elif args.from_file:
        # Batch mode from file

        # Read prompts from file
        with open(args.from_file, "r", encoding="utf-8") as f:
            prompt_lines = f.readlines()

        # Process prompts
        prompts_data = preprocess_prompts_for_batch(prompt_lines, args)
        process_batch_prompts(prompts_data, args)

    elif args.interactive:
        # Interactive mode
        process_interactive(args)

    else:
        # Single prompt mode (original behavior)

        # Generate latent
        gen_settings = get_generation_settings(args)
        vae, latent = generate(args, gen_settings)
        # print(f"Generated latent shape: {latent.shape}")
        if args.save_merged_model:
            return

        # Save latent and video
        save_output(args, vae, latent[0], device)

    logger.info("Done!")


if __name__ == "__main__":
    main()
frame_pack/__init__.py ADDED
File without changes
frame_pack/bucket_tools.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Predefined (height, width) bucket sizes, keyed by base resolution.
bucket_options = {
    640: [
        (416, 960),
        (448, 864),
        (480, 832),
        (512, 768),
        (544, 704),
        (576, 672),
        (608, 640),
        (640, 608),
        (672, 576),
        (704, 544),
        (768, 512),
        (832, 480),
        (864, 448),
        (960, 416),
    ],
}


def find_nearest_bucket(h, w, resolution=640):
    """Return the bucket (height, width) whose aspect ratio best matches (h, w).

    The score |h * bucket_w - w * bucket_h| is zero when aspect ratios match
    exactly. Ties are resolved in favor of the bucket listed last, matching
    the original `<=` scan; iterating the reversed list and taking min()
    reproduces that behavior.
    """
    candidates = bucket_options[resolution]
    return min(
        reversed(candidates),
        key=lambda hw: abs(h * hw[1] - w * hw[0]),
    )
+
frame_pack/clip_vision.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+
4
def hf_clip_vision_encode(image, feature_extractor, image_encoder):
    """Encode one RGB uint8 image (H, W, 3) with a HF vision encoder.

    Returns the raw model output object from ``image_encoder``.
    """
    assert isinstance(image, np.ndarray)
    assert image.ndim == 3 and image.shape[2] == 3
    assert image.dtype == np.uint8

    inputs = feature_extractor.preprocess(images=image, return_tensors="pt")
    inputs = inputs.to(device=image_encoder.device, dtype=image_encoder.dtype)
    return image_encoder(**inputs)
frame_pack/framepack_utils.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ from types import SimpleNamespace
4
+ from typing import Optional, Union
5
+
6
+ import accelerate
7
+ from accelerate import Accelerator, init_empty_weights
8
+ import torch
9
+ from safetensors.torch import load_file
10
+ from transformers import (
11
+ LlamaTokenizerFast,
12
+ LlamaConfig,
13
+ LlamaModel,
14
+ CLIPTokenizer,
15
+ CLIPTextModel,
16
+ CLIPConfig,
17
+ SiglipImageProcessor,
18
+ SiglipVisionModel,
19
+ SiglipVisionConfig,
20
+ )
21
+
22
+ from utils.safetensors_utils import load_split_weights
23
+ from hunyuan_model.vae import load_vae as hunyuan_load_vae
24
+
25
+ import logging
26
+
27
+ logger = logging.getLogger(__name__)
28
+ logging.basicConfig(level=logging.INFO)
29
+
30
+
31
def load_vae(
    vae_path: str, vae_chunk_size: Optional[int], vae_spatial_tile_sample_min_size: Optional[int], device: Union[str, torch.device]
):
    """Load the HunyuanVideo causal-3D VAE in fp16 and configure chunking/tiling.

    Args:
        vae_path: a single safetensors file, or a diffusers-style directory
            containing ``vae/diffusion_pytorch_model.safetensors``.
        vae_chunk_size: if set, chunk size applied recursively to CausalConv3d.
        vae_spatial_tile_sample_min_size: if set, enables spatial tiling with
            this sample-space tile size (latent tile size is 1/8 of it).
            When None, spatial tiling is still enabled with default sizes.
        device: target device for the VAE.

    Returns:
        The VAE in eval mode.
    """
    # single file and directory (contains 'vae') support
    # (fixed: removed a no-op `else: vae_path = vae_path` branch)
    if os.path.isdir(vae_path):
        vae_path = os.path.join(vae_path, "vae", "diffusion_pytorch_model.safetensors")

    vae_dtype = torch.float16  # if vae_dtype is None else str_to_dtype(vae_dtype)
    vae, _, s_ratio, t_ratio = hunyuan_load_vae(vae_dtype=vae_dtype, device=device, vae_path=vae_path)
    vae.eval()
    # vae_kwargs = {"s_ratio": s_ratio, "t_ratio": t_ratio}

    # set chunk_size to CausalConv3d recursively
    if vae_chunk_size is not None:
        vae.set_chunk_size_for_causal_conv_3d(vae_chunk_size)
        logger.info(f"Set chunk_size to {vae_chunk_size} for CausalConv3d")

    if vae_spatial_tile_sample_min_size is not None:
        vae.enable_spatial_tiling(True)
        vae.tile_sample_min_size = vae_spatial_tile_sample_min_size
        # latent resolution is 1/8 of the sample resolution
        vae.tile_latent_min_size = vae_spatial_tile_sample_min_size // 8
        logger.info(f"Enabled spatial tiling with min size {vae_spatial_tile_sample_min_size}")
    else:
        # always tile spatially to bound memory use, with the VAE's defaults
        vae.enable_spatial_tiling(True)

    return vae
61
+
62
+
63
+ # region Text Encoders
64
+
65
+ # Text Encoder configs are copied from HunyuanVideo repo
66
+
67
# Llama 3 8B configuration for text encoder 1, matching the HunyuanVideo
# release; used to rebuild the model when loading from a bare weights file
# (no config.json alongside it).
LLAMA_CONFIG = {
    "architectures": ["LlamaModel"],
    "attention_bias": False,
    "attention_dropout": 0.0,
    "bos_token_id": 128000,
    "eos_token_id": 128001,
    "head_dim": 128,
    "hidden_act": "silu",
    "hidden_size": 4096,
    "initializer_range": 0.02,
    "intermediate_size": 14336,
    "max_position_embeddings": 8192,
    "mlp_bias": False,
    "model_type": "llama",
    "num_attention_heads": 32,
    "num_hidden_layers": 32,
    "num_key_value_heads": 8,
    "pretraining_tp": 1,
    "rms_norm_eps": 1e-05,
    "rope_scaling": None,
    "rope_theta": 500000.0,
    "tie_word_embeddings": False,
    "torch_dtype": "float16",
    "transformers_version": "4.46.3",
    "use_cache": True,
    "vocab_size": 128320,
}

# CLIP-L text-model configuration for text encoder 2, likewise used when
# loading from a bare weights file.
CLIP_CONFIG = {
    # "_name_or_path": "/raid/aryan/llava-llama-3-8b-v1_1-extracted/text_encoder_2",
    "architectures": ["CLIPTextModel"],
    "attention_dropout": 0.0,
    "bos_token_id": 0,
    "dropout": 0.0,
    "eos_token_id": 2,
    "hidden_act": "quick_gelu",
    "hidden_size": 768,
    "initializer_factor": 1.0,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "layer_norm_eps": 1e-05,
    "max_position_embeddings": 77,
    "model_type": "clip_text_model",
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "pad_token_id": 1,
    "projection_dim": 768,
    "torch_dtype": "float16",
    "transformers_version": "4.48.0.dev0",
    "vocab_size": 49408,
}
+
119
+
120
def load_text_encoder1(
    args, fp8_llm: Optional[bool] = False, device: Optional[Union[str, torch.device]] = None
) -> tuple[LlamaTokenizerFast, LlamaModel]:
    """Load the Llama tokenizer and text encoder 1.

    Args:
        args: parsed CLI args; ``args.text_encoder1`` is either a
            diffusers-style directory or a (possibly split) safetensors file.
        fp8_llm: when True, cast the encoder to float8_e4m3fn on ``device``
            and patch Embedding/RMSNorm layers to run in the original dtype.
        device: target device for the encoder.

    Returns:
        (tokenizer, text_encoder) with the encoder in eval mode.
    """
    # single file, split file and directory (contains 'text_encoder') support
    logger.info(f"Loading text encoder 1 tokenizer")
    # tokenizer always comes from the HF hub reference repo
    tokenizer1 = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="tokenizer")

    logger.info(f"Loading text encoder 1 from {args.text_encoder1}")
    if os.path.isdir(args.text_encoder1):
        # load from directory, configs are in the directory
        text_encoder1 = LlamaModel.from_pretrained(args.text_encoder1, subfolder="text_encoder", torch_dtype=torch.float16)
    else:
        # load from file, we create the model with the appropriate config
        config = LlamaConfig(**LLAMA_CONFIG)
        with init_empty_weights():
            text_encoder1 = LlamaModel._from_config(config, torch_dtype=torch.float16)

        state_dict = load_split_weights(args.text_encoder1)

        # support weights from ComfyUI: strip the "model." prefix and drop
        # non-encoder entries (tokenizer blob, LM head)
        if "model.embed_tokens.weight" in state_dict:
            for key in list(state_dict.keys()):
                if key.startswith("model."):
                    new_key = key.replace("model.", "")
                    state_dict[new_key] = state_dict[key]
                    del state_dict[key]
        if "tokenizer" in state_dict:
            state_dict.pop("tokenizer")
        if "lm_head.weight" in state_dict:
            state_dict.pop("lm_head.weight")

        text_encoder1.load_state_dict(state_dict, strict=True, assign=True)

    if fp8_llm:
        org_dtype = text_encoder1.dtype
        logger.info(f"Moving and casting text encoder to {device} and torch.float8_e4m3fn")
        text_encoder1.to(device=device, dtype=torch.float8_e4m3fn)

        # prepare LLM for fp8: fp8 tensors cannot flow through Embedding or
        # RMSNorm directly, so keep Embedding in the original dtype and
        # replace each LlamaRMSNorm.forward with an fp32 re-implementation
        def prepare_fp8(llama_model: LlamaModel, target_dtype):
            def forward_hook(module):
                def forward(hidden_states):
                    # RMSNorm computed in fp32, weight cast to activation dtype
                    input_dtype = hidden_states.dtype
                    hidden_states = hidden_states.to(torch.float32)
                    variance = hidden_states.pow(2).mean(-1, keepdim=True)
                    hidden_states = hidden_states * torch.rsqrt(variance + module.variance_epsilon)
                    return module.weight.to(input_dtype) * hidden_states.to(input_dtype)

                return forward

            for module in llama_model.modules():
                if module.__class__.__name__ in ["Embedding"]:
                    # print("set", module.__class__.__name__, "to", target_dtype)
                    module.to(target_dtype)
                if module.__class__.__name__ in ["LlamaRMSNorm"]:
                    # print("set", module.__class__.__name__, "hooks")
                    module.forward = forward_hook(module)

        prepare_fp8(text_encoder1, org_dtype)
    else:
        text_encoder1.to(device)

    text_encoder1.eval()
    return tokenizer1, text_encoder1
188
+
189
+
190
def load_text_encoder2(args) -> tuple[CLIPTokenizer, CLIPTextModel]:
    """Load the CLIP-L tokenizer (from the HF hub) and text encoder 2.

    ``args.text_encoder2`` may be a diffusers-style directory (containing
    'text_encoder_2') or a single safetensors weights file.
    """
    logger.info("Loading text encoder 2 tokenizer")
    tokenizer2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="tokenizer_2")

    logger.info(f"Loading text encoder 2 from {args.text_encoder2}")
    if os.path.isdir(args.text_encoder2):
        # directory layout: configs live alongside the weights
        text_encoder2 = CLIPTextModel.from_pretrained(args.text_encoder2, subfolder="text_encoder_2", torch_dtype=torch.float16)
    else:
        # single weights file: build the model shell from the known config,
        # then load the weights into it
        config = CLIPConfig(**CLIP_CONFIG)
        with init_empty_weights():
            text_encoder2 = CLIPTextModel._from_config(config, torch_dtype=torch.float16)
        text_encoder2.load_state_dict(load_file(args.text_encoder2), strict=True, assign=True)

    text_encoder2.eval()
    return tokenizer2, text_encoder2
211
+
212
+
213
+ # endregion
214
+
215
+ # region image encoder
216
+
217
+ # Siglip configs are copied from FramePack repo
218
# SigLIP image-processor settings (384x384, [-1, 1] normalization), copied
# from the FramePack repo so no hub download is needed for the processor.
FEATURE_EXTRACTOR_CONFIG = {
    "do_convert_rgb": None,
    "do_normalize": True,
    "do_rescale": True,
    "do_resize": True,
    "image_mean": [0.5, 0.5, 0.5],
    "image_processor_type": "SiglipImageProcessor",
    "image_std": [0.5, 0.5, 0.5],
    "processor_class": "SiglipProcessor",
    "resample": 3,
    "rescale_factor": 0.00392156862745098,
    "size": {"height": 384, "width": 384},
}
# SigLIP vision-model configuration (FLUX.1-Redux image encoder), used to
# rebuild the model when loading from a bare weights file.
IMAGE_ENCODER_CONFIG = {
    "_name_or_path": "/home/lvmin/.cache/huggingface/hub/models--black-forest-labs--FLUX.1-Redux-dev/snapshots/1282f955f706b5240161278f2ef261d2a29ad649/image_encoder",
    "architectures": ["SiglipVisionModel"],
    "attention_dropout": 0.0,
    "hidden_act": "gelu_pytorch_tanh",
    "hidden_size": 1152,
    "image_size": 384,
    "intermediate_size": 4304,
    "layer_norm_eps": 1e-06,
    "model_type": "siglip_vision_model",
    "num_attention_heads": 16,
    "num_channels": 3,
    "num_hidden_layers": 27,
    "patch_size": 14,
    "torch_dtype": "bfloat16",
    "transformers_version": "4.46.2",
}
248
+
249
+
250
def load_image_encoders(args):
    """Load the SigLIP feature extractor (from a fixed config) and vision model.

    ``args.image_encoder`` may be a diffusers-style directory (containing
    'image_encoder') or a single safetensors weights file.
    """
    logger.info("Loading image encoder feature extractor")
    feature_extractor = SiglipImageProcessor(**FEATURE_EXTRACTOR_CONFIG)

    # single file, split file and directory (contains 'image_encoder') support
    logger.info(f"Loading image encoder from {args.image_encoder}")
    if os.path.isdir(args.image_encoder):
        # directory layout: configs live alongside the weights
        image_encoder = SiglipVisionModel.from_pretrained(args.image_encoder, subfolder="image_encoder", torch_dtype=torch.float16)
    else:
        # single weights file: build an empty model from the known config,
        # then load the weights into it
        config = SiglipVisionConfig(**IMAGE_ENCODER_CONFIG)
        with init_empty_weights():
            image_encoder = SiglipVisionModel._from_config(config, torch_dtype=torch.float16)
        image_encoder.load_state_dict(load_file(args.image_encoder), strict=True, assign=True)

    image_encoder.eval()
    return feature_extractor, image_encoder
271
+
272
+
273
+ # endregion
frame_pack/hunyuan.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # original code: https://github.com/lllyasviel/FramePack
2
+ # original license: Apache-2.0
3
+
4
+ import torch
5
+
6
+ # from diffusers.pipelines.hunyuan_video.pipeline_hunyuan_video import DEFAULT_PROMPT_TEMPLATE
7
+ # from diffusers_helper.utils import crop_or_pad_yield_mask
8
+ from hunyuan_model.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
9
+ from hunyuan_model.text_encoder import PROMPT_TEMPLATE
10
+
11
+
12
@torch.no_grad()
def encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2, max_length=256, custom_system_prompt=None):
    """Encode one prompt with both HunyuanVideo text encoders.

    Args:
        prompt: a single prompt string.
        text_encoder / tokenizer: Llama encoder and its tokenizer.
        text_encoder_2 / tokenizer_2: CLIP-L encoder and its tokenizer.
        max_length: maximum number of prompt tokens kept after the system
            prompt (Llama input is padded/truncated to max_length + crop_start).
        custom_system_prompt: optional replacement for the default video
            system prompt; its token count becomes the crop offset.

    Returns:
        (llama_vec, clip_l_pooler): Llama hidden states (3rd-from-last layer)
        cropped to the user-prompt tokens, and the CLIP pooled embedding.
    """
    assert isinstance(prompt, str)

    prompt = [prompt]

    # LLAMA

    # We can verify crop_start by checking the token count of the prompt:
    # custom_system_prompt = (
    #     "Describe the video by detailing the following aspects: "
    #     "1. The main content and theme of the video."
    #     "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
    #     "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
    #     "4. background environment, light, style and atmosphere."
    #     "5. camera angles, movements, and transitions used in the video:"
    # )
    if custom_system_prompt is None:
        prompt_llama = [PROMPT_TEMPLATE["dit-llm-encode-video"]["template"].format(p) for p in prompt]
        crop_start = PROMPT_TEMPLATE["dit-llm-encode-video"]["crop_start"]
    else:
        # count tokens for custom_system_prompt to find where the user text starts
        full_prompt = f"<|start_header_id|>system<|end_header_id|>\n\n{custom_system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
        print(f"Custom system prompt: {full_prompt}")
        system_prompt_tokens = tokenizer(full_prompt, return_tensors="pt", truncation=True).input_ids[0].shape[0]
        print(f"Custom system prompt token count: {system_prompt_tokens}")
        prompt_llama = [full_prompt + p + "<|eot_id|>" for p in prompt]
        crop_start = system_prompt_tokens

    llama_inputs = tokenizer(
        prompt_llama,
        padding="max_length",
        max_length=max_length + crop_start,
        truncation=True,
        return_tensors="pt",
        return_length=False,
        return_overflowing_tokens=False,
        return_attention_mask=True,
    )

    llama_input_ids = llama_inputs.input_ids.to(text_encoder.device)
    llama_attention_mask = llama_inputs.attention_mask.to(text_encoder.device)
    llama_attention_length = int(llama_attention_mask.sum())

    llama_outputs = text_encoder(
        input_ids=llama_input_ids,
        attention_mask=llama_attention_mask,
        output_hidden_states=True,
    )

    # take the 3rd-from-last hidden layer and crop off the system-prompt tokens
    llama_vec = llama_outputs.hidden_states[-3][:, crop_start:llama_attention_length]
    # llama_vec_remaining = llama_outputs.hidden_states[-3][:, llama_attention_length:]
    llama_attention_mask = llama_attention_mask[:, crop_start:llama_attention_length]

    # after cropping, every remaining position must be a real (unmasked) token
    assert torch.all(llama_attention_mask.bool())

    # CLIP

    clip_l_input_ids = tokenizer_2(
        prompt,
        padding="max_length",
        max_length=77,
        truncation=True,
        return_overflowing_tokens=False,
        return_length=False,
        return_tensors="pt",
    ).input_ids
    clip_l_pooler = text_encoder_2(clip_l_input_ids.to(text_encoder_2.device), output_hidden_states=False).pooler_output

    return llama_vec, clip_l_pooler
82
+
83
+
84
+ @torch.no_grad()
85
+ def vae_decode_fake(latents):
86
+ latent_rgb_factors = [
87
+ [-0.0395, -0.0331, 0.0445],
88
+ [0.0696, 0.0795, 0.0518],
89
+ [0.0135, -0.0945, -0.0282],
90
+ [0.0108, -0.0250, -0.0765],
91
+ [-0.0209, 0.0032, 0.0224],
92
+ [-0.0804, -0.0254, -0.0639],
93
+ [-0.0991, 0.0271, -0.0669],
94
+ [-0.0646, -0.0422, -0.0400],
95
+ [-0.0696, -0.0595, -0.0894],
96
+ [-0.0799, -0.0208, -0.0375],
97
+ [0.1166, 0.1627, 0.0962],
98
+ [0.1165, 0.0432, 0.0407],
99
+ [-0.2315, -0.1920, -0.1355],
100
+ [-0.0270, 0.0401, -0.0821],
101
+ [-0.0616, -0.0997, -0.0727],
102
+ [0.0249, -0.0469, -0.1703],
103
+ ] # From comfyui
104
+
105
+ latent_rgb_factors_bias = [0.0259, -0.0192, -0.0761]
106
+
107
+ weight = torch.tensor(latent_rgb_factors, device=latents.device, dtype=latents.dtype).transpose(0, 1)[:, :, None, None, None]
108
+ bias = torch.tensor(latent_rgb_factors_bias, device=latents.device, dtype=latents.dtype)
109
+
110
+ images = torch.nn.functional.conv3d(latents, weight, bias=bias, stride=1, padding=0, dilation=1, groups=1)
111
+ images = images.clamp(0.0, 1.0)
112
+
113
+ return images
114
+
115
+
116
@torch.no_grad()
def vae_decode(latents, vae, image_mode=False) -> torch.Tensor:
    """Decode latents to pixels with the VAE.

    When ``image_mode`` is True, frames are decoded one at a time along the
    temporal axis (dim 2) and re-concatenated, trading speed for memory.
    """
    scaled = latents / vae.config.scaling_factor

    if image_mode:
        frames = scaled.to(device=vae.device, dtype=vae.dtype).unbind(2)
        decoded = [vae.decode(frame.unsqueeze(2)).sample for frame in frames]
        return torch.cat(decoded, dim=2)

    return vae.decode(scaled.to(device=vae.device, dtype=vae.dtype)).sample
128
+
129
+
130
@torch.no_grad()
def vae_encode(image, vae: AutoencoderKLCausal3D) -> torch.Tensor:
    """Encode pixels to latents (sampled from the posterior) and scale them."""
    posterior = vae.encode(image.to(device=vae.device, dtype=vae.dtype))
    return posterior.latent_dist.sample() * vae.config.scaling_factor
frame_pack/hunyuan_video_packed.py ADDED
@@ -0,0 +1,2015 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # original code: https://github.com/lllyasviel/FramePack
2
+ # original license: Apache-2.0
3
+
4
+ import glob
5
+ import math
6
+ import numbers
7
+ import os
8
+ from types import SimpleNamespace
9
+ from typing import Any, Dict, List, Optional, Tuple, Union
10
+
11
+ import torch
12
+ import einops
13
+ import torch.nn as nn
14
+ import torch.nn.functional as F
15
+ import numpy as np
16
+
17
+ from modules.custom_offloading_utils import ModelOffloader
18
+ from utils.safetensors_utils import load_split_weights
19
+ from modules.fp8_optimization_utils import apply_fp8_monkey_patch, optimize_state_dict_with_fp8
20
+ from accelerate import init_empty_weights
21
+
22
# Optional attention backends: each is probed independently and replaced with
# None when unavailable, so callers can feature-test the module-level names.
# Fixed: bare `except:` would also swallow KeyboardInterrupt/SystemExit; the
# handlers now catch Exception only.
try:
    # raise NotImplementedError
    from xformers.ops import memory_efficient_attention as xformers_attn_func

    print("Xformers is installed!")
except Exception:
    print("Xformers is not installed!")
    xformers_attn_func = None

try:
    # raise NotImplementedError
    from flash_attn import flash_attn_varlen_func, flash_attn_func

    print("Flash Attn is installed!")
except Exception:
    print("Flash Attn is not installed!")
    flash_attn_varlen_func = None
    flash_attn_func = None

try:
    # raise NotImplementedError
    from sageattention import sageattn_varlen, sageattn

    print("Sage Attn is installed!")
except Exception:
    print("Sage Attn is not installed!")
    sageattn_varlen = None
    sageattn = None
50
+
51
+
52
+ import logging
53
+
54
+ logger = logging.getLogger(__name__)
55
+ logging.basicConfig(level=logging.INFO)
56
+
57
+ # region diffusers
58
+
59
+ # copied from diffusers with some modifications to minimize dependencies
60
+ # original code: https://github.com/huggingface/diffusers/
61
+ # original license: Apache-2.0
62
+
63
# Lower-case activation name -> nn.Module class.
ACT2CLS = {
    "swish": nn.SiLU,
    "silu": nn.SiLU,
    "mish": nn.Mish,
    "gelu": nn.GELU,
    "relu": nn.ReLU,
}


def get_activation(act_fn: str) -> nn.Module:
    """Instantiate the activation module named by ``act_fn`` (case-insensitive).

    Raises:
        ValueError: if the name is not present in ``ACT2CLS``.
    """
    act_fn = act_fn.lower()
    try:
        return ACT2CLS[act_fn]()
    except KeyError:
        raise ValueError(f"activation function {act_fn} not found in ACT2FN mapping {list(ACT2CLS.keys())}") from None
87
+
88
+
89
def get_timestep_embedding(
    timesteps: torch.Tensor,
    embedding_dim: int,
    flip_sin_to_cos: bool = False,
    downscale_freq_shift: float = 1,
    scale: float = 1,
    max_period: int = 10000,
):
    """Create DDPM-style sinusoidal timestep embeddings.

    Args:
        timesteps: 1-D tensor of N (possibly fractional) timestep indices.
        embedding_dim: output embedding width.
        flip_sin_to_cos: emit `cos, sin` order instead of `sin, cos`.
        downscale_freq_shift: delta between frequencies across dimensions.
        scale: multiplier applied to the raw phase values.
        max_period: controls the lowest embedding frequency.

    Returns:
        An (N, embedding_dim) tensor of positional embeddings; zero-padded on
        the right when embedding_dim is odd.
    """
    assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"

    half_dim = embedding_dim // 2

    # log-spaced frequencies (computed exactly as the reference implementation)
    exponent = -math.log(max_period) * torch.arange(start=0, end=half_dim, dtype=torch.float32, device=timesteps.device)
    exponent = exponent / (half_dim - downscale_freq_shift)
    freqs = torch.exp(exponent)

    # outer product of timesteps and frequencies, then scaled
    phases = scale * (timesteps[:, None].float() * freqs[None, :])

    # sine block first, cosine block second
    emb = torch.cat([torch.sin(phases), torch.cos(phases)], dim=-1)

    if flip_sin_to_cos:
        # swap to cosine-first ordering
        emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)

    if embedding_dim % 2 == 1:
        # odd width: pad one zero column on the right
        emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
    return emb
139
+
140
+
141
class TimestepEmbedding(nn.Module):
    """Two-layer MLP mapping a timestep embedding into the model's embed dim.

    Optionally adds a (bias-free) projection of an extra conditioning vector
    to the input first, and optionally applies a second activation to the
    output.
    """

    def __init__(
        self,
        in_channels: int,
        time_embed_dim: int,
        act_fn: str = "silu",
        out_dim: int = None,
        post_act_fn: Optional[str] = None,
        cond_proj_dim=None,
        sample_proj_bias=True,
    ):
        super().__init__()

        self.linear_1 = nn.Linear(in_channels, time_embed_dim, sample_proj_bias)
        # optional projection for an extra conditioning signal (no bias)
        self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False) if cond_proj_dim is not None else None
        self.act = get_activation(act_fn)
        self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim if out_dim is None else out_dim, sample_proj_bias)
        self.post_act = None if post_act_fn is None else get_activation(post_act_fn)

    def forward(self, sample, condition=None):
        if condition is not None:
            sample = sample + self.cond_proj(condition)

        out = self.linear_1(sample)
        if self.act is not None:
            out = self.act(out)
        out = self.linear_2(out)

        return out if self.post_act is None else self.post_act(out)
187
+
188
+
189
class Timesteps(nn.Module):
    """Module wrapper around ``get_timestep_embedding`` with fixed settings."""

    def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float, scale: int = 1):
        super().__init__()
        self.num_channels = num_channels
        self.flip_sin_to_cos = flip_sin_to_cos
        self.downscale_freq_shift = downscale_freq_shift
        self.scale = scale

    def forward(self, timesteps):
        return get_timestep_embedding(
            timesteps,
            self.num_channels,
            flip_sin_to_cos=self.flip_sin_to_cos,
            downscale_freq_shift=self.downscale_freq_shift,
            scale=self.scale,
        )
206
+
207
+
208
class FP32SiLU(nn.Module):
    r"""SiLU evaluated in float32; the result is cast back to the input dtype."""

    def __init__(self):
        super().__init__()

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        out = F.silu(inputs.float(), inplace=False)
        return out.to(inputs.dtype)
218
+
219
+
220
class GELU(nn.Module):
    r"""Linear projection followed by GELU (tanh approximation optional).

    Parameters:
        dim_in (`int`): number of input channels.
        dim_out (`int`): number of output channels.
        approximate (`str`, defaults to `"none"`): `"tanh"` selects the tanh
            approximation of GELU.
        bias (`bool`, defaults to True): whether the linear layer has a bias.
    """

    def __init__(self, dim_in: int, dim_out: int, approximate: str = "none", bias: bool = True):
        super().__init__()
        self.proj = nn.Linear(dim_in, dim_out, bias=bias)
        self.approximate = approximate

    def gelu(self, gate: torch.Tensor) -> torch.Tensor:
        # (an old mps fp16 workaround used to live here; plain gelu suffices now)
        return F.gelu(gate, approximate=self.approximate)

    def forward(self, hidden_states):
        return self.gelu(self.proj(hidden_states))
246
+
247
+
248
class PixArtAlphaTextProjection(nn.Module):
    """Projects caption embeddings through linear -> activation -> linear.

    Also handles dropout for classifier-free guidance. Adapted from
    https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
    """

    def __init__(self, in_features, hidden_size, out_features=None, act_fn="gelu_tanh"):
        super().__init__()
        out_features = hidden_size if out_features is None else out_features
        self.linear_1 = nn.Linear(in_features=in_features, out_features=hidden_size, bias=True)
        if act_fn == "gelu_tanh":
            self.act_1 = nn.GELU(approximate="tanh")
        elif act_fn == "silu":
            self.act_1 = nn.SiLU()
        elif act_fn == "silu_fp32":
            self.act_1 = FP32SiLU()
        else:
            raise ValueError(f"Unknown activation function: {act_fn}")
        self.linear_2 = nn.Linear(in_features=hidden_size, out_features=out_features, bias=True)

    def forward(self, caption):
        return self.linear_2(self.act_1(self.linear_1(caption)))
275
+
276
+
277
+ class LayerNormFramePack(nn.LayerNorm):
278
+ # casting to dtype of input tensor is added
279
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
280
+ return torch.nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps).to(x)
281
+
282
+
283
class FP32LayerNormFramePack(nn.LayerNorm):
    # LayerNorm evaluated in float32 regardless of input dtype; the output is
    # cast back to the original dtype.
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        w = None if self.weight is None else self.weight.float()
        b = None if self.bias is None else self.bias.float()
        y = torch.nn.functional.layer_norm(x.float(), self.normalized_shape, w, b, self.eps)
        return y.to(x.dtype)
293
+
294
+
295
class RMSNormFramePack(nn.Module):
    r"""Root-mean-square normalization (https://arxiv.org/abs/1910.07467).

    Args:
        dim (`int`): size of the normalized (last) dimension, used for the
            affine parameters. Only effective when `elementwise_affine` is True.
        eps (`float`): numerical stabilizer added to the mean square.
        elementwise_affine (`bool`, defaults to `True`): learn a per-channel
            scale when True.
        bias (`bool`, defaults to False): also allocate a per-channel bias
            parameter (kept for interface parity; the forward pass does not
            apply it, matching the reference implementation).
    """

    def __init__(self, dim, eps: float, elementwise_affine: bool = True, bias: bool = False):
        super().__init__()

        self.eps = eps
        self.elementwise_affine = elementwise_affine

        if isinstance(dim, numbers.Integral):
            dim = (dim,)
        self.dim = torch.Size(dim)

        self.weight = None
        self.bias = None
        if elementwise_affine:
            self.weight = nn.Parameter(torch.ones(dim))
            if bias:
                self.bias = nn.Parameter(torch.zeros(dim))

    def forward(self, hidden_states):
        orig_dtype = hidden_states.dtype
        # mean square computed in fp32 for stability
        mean_sq = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        normed = hidden_states * torch.rsqrt(mean_sq + self.eps)

        if self.weight is None:
            return normed.to(orig_dtype)

        return normed.to(orig_dtype) * self.weight.to(orig_dtype)
335
+
336
+
337
class AdaLayerNormContinuousFramePack(nn.Module):
    r"""Adaptive normalization: a conditioning embedding is projected to a
    per-channel (scale, shift) pair that modulates the normalized input.

    Args:
        embedding_dim (`int`): embedding dimension used during projection.
        conditioning_embedding_dim (`int`): dimension of the input condition.
        elementwise_affine (`bool`, defaults to `True`): whether the inner norm
            layer itself carries affine parameters. NOTE: since the output is
            immediately scaled and shifted by the projected conditioning, you
            likely want this False; True is kept to match the original code.
        eps (`float`, defaults to 1e-5): epsilon factor.
        bias (`bool`, defaults to `True`): whether projections use a bias.
        norm_type (`str`, defaults to `"layer_norm"`): "layer_norm" or "rms_norm".
    """

    def __init__(
        self,
        embedding_dim: int,
        conditioning_embedding_dim: int,
        elementwise_affine=True,
        eps=1e-5,
        bias=True,
        norm_type="layer_norm",
    ):
        super().__init__()
        self.silu = nn.SiLU()
        self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias)
        if norm_type == "layer_norm":
            self.norm = LayerNormFramePack(embedding_dim, eps, elementwise_affine, bias)
        elif norm_type == "rms_norm":
            self.norm = RMSNormFramePack(embedding_dim, eps, elementwise_affine)
        else:
            raise ValueError(f"unknown norm_type {norm_type}")

    def forward(self, x, conditioning_embedding):
        # project conditioning to per-channel scale/shift (chunked in that order)
        scale, shift = self.linear(self.silu(conditioning_embedding)).chunk(2, dim=1)
        return self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
381
+
382
+
383
class LinearActivation(nn.Module):
    """Linear projection immediately followed by an activation function."""

    def __init__(self, dim_in: int, dim_out: int, bias: bool = True, activation: str = "silu"):
        super().__init__()
        self.proj = nn.Linear(dim_in, dim_out, bias=bias)
        self.activation = get_activation(activation)

    def forward(self, hidden_states):
        return self.activation(self.proj(hidden_states))
393
+
394
+
395
class FeedForward(nn.Module):
    r"""
    Transformer feed-forward block.

    Parameters:
        dim (`int`): Number of channels in the input.
        dim_out (`int`, *optional*): Number of channels in the output; defaults to `dim`.
        mult (`int`, *optional*, defaults to 4): Hidden-dim multiplier when `inner_dim` is not given.
        dropout (`float`, *optional*, defaults to 0.0): Dropout probability.
        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation variant. Only
            "gelu-approximate" and "linear-silu" are kept in this trimmed copy of diffusers'
            FeedForward; other variants raise.
        final_dropout (`bool`, *optional*, defaults to False): Apply a trailing dropout.
        inner_dim (`int`, *optional*): Explicit hidden dimension; overrides `mult`.
        bias (`bool`, defaults to True): Whether to use a bias in the linear layers.
    """

    def __init__(
        self,
        dim: int,
        dim_out: Optional[int] = None,
        mult: int = 4,
        dropout: float = 0.0,
        activation_fn: str = "geglu",
        final_dropout: bool = False,
        inner_dim=None,
        bias: bool = True,
    ):
        super().__init__()
        inner_dim = int(dim * mult) if inner_dim is None else inner_dim
        dim_out = dim if dim_out is None else dim_out

        if activation_fn == "gelu-approximate":
            act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias)
        elif activation_fn == "linear-silu":
            act_fn = LinearActivation(dim, inner_dim, bias=bias, activation="silu")
        else:
            raise ValueError(f"Unknown activation function: {activation_fn}")

        # project in -> dropout -> project out (module order matches diffusers' net.0/1/2).
        layers = [act_fn, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out, bias=bias)]
        # FF as used in Vision Transformer, MLP-Mixer, etc. may have a final dropout.
        if final_dropout:
            layers.append(nn.Dropout(dropout))
        self.net = nn.ModuleList(layers)

    def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor:
        # The deprecated `scale` argument is rejected outright.
        if len(args) > 0 or kwargs.get("scale", None) is not None:
            raise ValueError("scale is not supported in this version. Please remove it.")
        for layer in self.net:
            hidden_states = layer(hidden_states)
        return hidden_states
459
+
460
+
461
+ # @maybe_allow_in_graph
462
# @maybe_allow_in_graph
class Attention(nn.Module):
    r"""
    Minimal copy of Attention class from diffusers.

    Builds q/k/v projections (plus optional "added" projections for a second
    token stream via `added_kv_proj_dim`), optional per-head RMS q/k
    normalization, and delegates the actual attention computation to a
    processor object (AttnProcessor2_0 by default).
    """

    def __init__(
        self,
        query_dim: int,
        cross_attention_dim: Optional[int] = None,
        heads: int = 8,
        dim_head: int = 64,
        bias: bool = False,
        qk_norm: Optional[str] = None,
        added_kv_proj_dim: Optional[int] = None,
        eps: float = 1e-5,
        processor: Optional[any] = None,
        out_dim: int = None,
        context_pre_only=None,
        pre_only=False,
    ):
        super().__init__()
        # Width of the q/k/v projections; `out_dim` overrides heads * dim_head.
        self.inner_dim = out_dim if out_dim is not None else dim_head * heads
        self.inner_kv_dim = self.inner_dim  # if kv_heads is None else dim_head * kv_heads
        self.query_dim = query_dim
        self.use_bias = bias
        # Falls back to self-attention when no cross-attention dim is given.
        self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
        self.out_dim = out_dim if out_dim is not None else query_dim
        self.out_context_dim = query_dim
        self.context_pre_only = context_pre_only
        self.pre_only = pre_only

        self.scale = dim_head**-0.5
        self.heads = out_dim // dim_head if out_dim is not None else heads

        self.added_kv_proj_dim = added_kv_proj_dim

        if qk_norm is None:
            self.norm_q = None
            self.norm_k = None
        elif qk_norm == "rms_norm":
            self.norm_q = RMSNormFramePack(dim_head, eps=eps)
            self.norm_k = RMSNormFramePack(dim_head, eps=eps)
        else:
            raise ValueError(
                f"unknown qk_norm: {qk_norm}. Should be one of None, 'layer_norm', 'fp32_layer_norm', 'layer_norm_across_heads', 'rms_norm', 'rms_norm_across_heads', 'l2'."
            )

        self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias)
        self.to_k = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias)
        self.to_v = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias)

        self.added_proj_bias = True  # added_proj_bias
        # Optional projections for a second (context/text) token stream.
        if self.added_kv_proj_dim is not None:
            self.add_k_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=True)
            self.add_v_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=True)
            if self.context_pre_only is not None:
                self.add_q_proj = nn.Linear(added_kv_proj_dim, self.inner_dim, bias=True)
        else:
            self.add_q_proj = None
            self.add_k_proj = None
            self.add_v_proj = None

        if not self.pre_only:
            self.to_out = nn.ModuleList([])
            self.to_out.append(nn.Linear(self.inner_dim, self.out_dim, bias=True))
            # self.to_out.append(nn.Dropout(dropout))
            self.to_out.append(nn.Identity())  # dropout=0.0
        else:
            self.to_out = None

        if self.context_pre_only is not None and not self.context_pre_only:
            self.to_add_out = nn.Linear(self.inner_dim, self.out_context_dim, bias=True)
        else:
            self.to_add_out = None

        if qk_norm is not None and added_kv_proj_dim is not None:
            if qk_norm == "rms_norm":
                self.norm_added_q = RMSNormFramePack(dim_head, eps=eps)
                self.norm_added_k = RMSNormFramePack(dim_head, eps=eps)
            else:
                raise ValueError(f"unknown qk_norm: {qk_norm}. Should be one of `None,'layer_norm','fp32_layer_norm','rms_norm'`")
        else:
            self.norm_added_q = None
            self.norm_added_k = None

        # set attention processor
        # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
        # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
        if processor is None:
            processor = AttnProcessor2_0()
        self.set_processor(processor)

    def set_processor(self, processor: any) -> None:
        # Replace the callable that implements the attention computation.
        self.processor = processor

    def get_processor(self) -> any:
        return self.processor

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        **cross_attention_kwargs,
    ) -> torch.Tensor:
        # All heavy lifting is delegated to the configured processor.
        return self.processor(
            self,
            hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            attention_mask=attention_mask,
            **cross_attention_kwargs,
        )

    def prepare_attention_mask(
        self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3
    ) -> torch.Tensor:
        r"""
        Prepare the attention mask for the attention computation.

        Args:
            attention_mask (`torch.Tensor`):
                The attention mask to prepare.
            target_length (`int`):
                The target length of the attention mask. This is the length of the attention mask after padding.
            batch_size (`int`):
                The batch size, which is used to repeat the attention mask.
            out_dim (`int`, *optional*, defaults to `3`):
                The output dimension of the attention mask. Can be either `3` or `4`.

        Returns:
            `torch.Tensor`: The prepared attention mask.
        """
        head_size = self.heads
        if attention_mask is None:
            return attention_mask

        current_length: int = attention_mask.shape[-1]
        if current_length != target_length:
            if attention_mask.device.type == "mps":
                # HACK: MPS: Does not support padding by greater than dimension of input tensor.
                # Instead, we can manually construct the padding tensor.
                padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length)
                padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device)
                attention_mask = torch.cat([attention_mask, padding], dim=2)
            else:
                # TODO: for pipelines such as stable-diffusion, padding cross-attn mask:
                # we want to instead pad by (0, remaining_length), where remaining_length is:
                # remaining_length: int = target_length - current_length
                # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding
                attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)

        if out_dim == 3:
            # Expand to one mask row per attention head: (B * heads, ..., L).
            if attention_mask.shape[0] < batch_size * head_size:
                attention_mask = attention_mask.repeat_interleave(head_size, dim=0, output_size=attention_mask.shape[0] * head_size)
        elif out_dim == 4:
            attention_mask = attention_mask.unsqueeze(1)
            attention_mask = attention_mask.repeat_interleave(head_size, dim=1, output_size=attention_mask.shape[1] * head_size)

        return attention_mask
621
+
622
+
623
class AttnProcessor2_0:
    r"""
    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
    """

    def __init__(self):
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        temb: Optional[torch.Tensor] = None,
        *args,
        **kwargs,
    ) -> torch.Tensor:
        # 4D (B, C, H, W) inputs are flattened to (B, H*W, C) and restored at the end.
        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, sequence_length, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape

        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
            # scaled_dot_product_attention expects attention_mask shape to be
            # (batch, heads, source_length, target_length)
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        query = attn.to_q(hidden_states)
        query_dtype = query.dtype  # store dtype before potentially deleting query

        # Self-attention when no separate context is provided.
        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        # (B, L, inner) -> (B, heads, L, head_dim)
        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        if attn.norm_q is not None:
            query = attn.norm_q(query)
        if attn.norm_k is not None:
            key = attn.norm_k(key)

        # the output of sdp = (batch, num_heads, seq_len, head_dim)
        hidden_states = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False)
        del query, key, value, attention_mask  # free memory

        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query_dtype)  # use stored dtype

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        return hidden_states
693
+
694
+
695
+ # endregion diffusers
696
+
697
+
698
def pad_for_3d_conv(x, kernel_size):
    """Right-pad a (B, C, T, H, W) tensor so T/H/W become multiples of the kernel.

    Padding is applied only at the end of each spatial/temporal axis using
    replicate mode, so existing values are unchanged.
    """
    _, _, t, h, w = x.shape
    kt, kh, kw = kernel_size
    # `-n % k` equals the amount needed to round n up to a multiple of k.
    pad = (0, -w % kw, 0, -h % kh, 0, -t % kt)
    return torch.nn.functional.pad(x, pad, mode="replicate")
705
+
706
+
707
def center_down_sample_3d(x, kernel_size):
    """Downsample a (B, C, T, H, W) tensor by `kernel_size` via average pooling.

    Historically this picked the center element of each (pt, ph, pw) patch
    (see the einops-based implementation it replaced); average pooling is the
    current behavior.
    """
    return torch.nn.functional.avg_pool3d(x, kernel_size, stride=kernel_size)
714
+
715
+
716
def get_cu_seqlens(text_mask, img_len):
    """Build cumulative sequence lengths for varlen attention kernels.

    Each batch item's packed sequence is [img_len image tokens, valid text
    tokens, padding] with total length text_mask.shape[1] + img_len. The
    returned int32 tensor has 2*B + 1 entries: for item i, entry 2i+1 marks
    the end of its valid tokens and entry 2i+2 the end of its padded slot.
    """
    batch_size = text_mask.shape[0]
    text_len = text_mask.sum(dim=1)
    max_len = text_mask.shape[1] + img_len

    cu_seqlens = torch.zeros([2 * batch_size + 1], dtype=torch.int32, device=text_mask.device)  # ensure device match
    for i in range(batch_size):
        valid_end = i * max_len + text_len[i] + img_len
        padded_end = (i + 1) * max_len
        cu_seqlens[2 * i + 1] = valid_end
        cu_seqlens[2 * i + 2] = padded_end
    return cu_seqlens
731
+
732
+
733
def apply_rotary_emb_transposed(x, freqs_cis):
    """Apply rotary position embedding to x (..., seq, heads, dim).

    `freqs_cis` packs the cos table and the sin table along its last dim; a
    head axis is inserted via unsqueeze(-2) so one table broadcasts over all
    heads. The rotation is computed in float32 and cast back to x's dtype.
    """
    cos, sin = freqs_cis.unsqueeze(-2).chunk(2, dim=-1)
    # Split the last dim into (pairs, 2) and build the 90-degree-rotated partner.
    even, odd = x.unflatten(-1, (-1, 2)).unbind(-1)
    rotated = torch.stack([-odd, even], dim=-1).flatten(3)
    return (x.float() * cos + rotated.float() * sin).to(x.dtype)
740
+
741
+
742
def attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv, attn_mode=None, split_attn=False):
    """Run attention with an automatically selected backend.

    q/k/v are in (batch, seq, heads, head_dim) layout. Three paths:
      1. No varlen metadata: one batched call to the selected backend
         (sageattn -> flash-attn -> xformers -> SDPA, in that order of
         preference when attn_mode is None).
      2. `split_attn`: the batch is processed one item at a time with a dense
         kernel. NOTE(review): cu_seqlens_* / max_seqlen_* are ignored on this
         path — it appears to assume per-item sequences without padding; confirm.
      3. Otherwise: batch is flattened into one packed sequence and handed to
         a varlen kernel (sageattn_varlen / flash_attn_varlen_func); raises if
         neither is installed.
    """
    if cu_seqlens_q is None and cu_seqlens_kv is None and max_seqlen_q is None and max_seqlen_kv is None:
        if attn_mode == "sageattn" or attn_mode is None and sageattn is not None:
            x = sageattn(q, k, v, tensor_layout="NHD")
            return x

        if attn_mode == "flash" or attn_mode is None and flash_attn_func is not None:
            x = flash_attn_func(q, k, v)
            return x

        if attn_mode == "xformers" or attn_mode is None and xformers_attn_func is not None:
            x = xformers_attn_func(q, k, v)
            return x

        # SDPA expects (batch, heads, seq, dim); transpose in and back out.
        x = torch.nn.functional.scaled_dot_product_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)).transpose(
            1, 2
        )
        return x
    if split_attn:
        # Per-item path: run a dense kernel on each batch element separately.
        if attn_mode == "sageattn" or attn_mode is None and sageattn is not None:
            x = torch.empty_like(q)
            for i in range(q.size(0)):
                x[i : i + 1] = sageattn(q[i : i + 1], k[i : i + 1], v[i : i + 1], tensor_layout="NHD")
            return x

        if attn_mode == "flash" or attn_mode is None and flash_attn_func is not None:
            x = torch.empty_like(q)
            for i in range(q.size(0)):
                x[i : i + 1] = flash_attn_func(q[i : i + 1], k[i : i + 1], v[i : i + 1])
            return x

        if attn_mode == "xformers" or attn_mode is None and xformers_attn_func is not None:
            x = torch.empty_like(q)
            for i in range(q.size(0)):
                x[i : i + 1] = xformers_attn_func(q[i : i + 1], k[i : i + 1], v[i : i + 1])
            return x

        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)
        x = torch.empty_like(q)
        for i in range(q.size(0)):
            x[i : i + 1] = torch.nn.functional.scaled_dot_product_attention(q[i : i + 1], k[i : i + 1], v[i : i + 1])
        x = x.transpose(1, 2)
        return x

    # Varlen path: flatten (B, L) into one packed dimension for the varlen kernels.
    batch_size = q.shape[0]
    q = q.view(q.shape[0] * q.shape[1], *q.shape[2:])
    k = k.view(k.shape[0] * k.shape[1], *k.shape[2:])
    v = v.view(v.shape[0] * v.shape[1], *v.shape[2:])
    if attn_mode == "sageattn" or attn_mode is None and sageattn_varlen is not None:
        x = sageattn_varlen(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
        del q, k, v  # free memory
    elif attn_mode == "flash" or attn_mode is None and flash_attn_varlen_func is not None:
        x = flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
        del q, k, v  # free memory
    else:
        raise NotImplementedError("No Attn Installed or batch_size > 1 is not supported in this configuration. Try `--split_attn`.")
    x = x.view(batch_size, max_seqlen_q, *x.shape[2:])
    return x
802
+
803
+
804
class HunyuanAttnProcessorFlashAttnDouble:
    """Attention processor for double-stream (image + text) transformer blocks.

    Image and text tokens use separate q/k/v projections; RoPE is applied to
    the image stream only; both streams are concatenated for joint attention
    and split again afterwards, each with its own output projection.

    Note: `attention_mask` here is NOT a dense mask but the 4-tuple
    (cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv) consumed by
    `attn_varlen_func`.
    """

    def __call__(
        self,
        attn: Attention,
        hidden_states,
        encoder_hidden_states,
        attention_mask,
        image_rotary_emb,
        attn_mode: Optional[str] = None,
        split_attn: Optional[bool] = False,
    ):
        cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv = attention_mask

        # Project image latents
        query = attn.to_q(hidden_states)
        key = attn.to_k(hidden_states)
        value = attn.to_v(hidden_states)
        del hidden_states  # free memory

        # (B, L, inner) -> (B, L, heads, head_dim)
        query = query.unflatten(2, (attn.heads, -1))
        key = key.unflatten(2, (attn.heads, -1))
        value = value.unflatten(2, (attn.heads, -1))

        query = attn.norm_q(query)
        key = attn.norm_k(key)

        # RoPE is applied to the image tokens only.
        query = apply_rotary_emb_transposed(query, image_rotary_emb)
        key = apply_rotary_emb_transposed(key, image_rotary_emb)
        del image_rotary_emb  # free memory

        # Project context (text/encoder) embeddings
        encoder_query = attn.add_q_proj(encoder_hidden_states)
        encoder_key = attn.add_k_proj(encoder_hidden_states)
        encoder_value = attn.add_v_proj(encoder_hidden_states)
        txt_length = encoder_hidden_states.shape[1]  # store length before deleting
        del encoder_hidden_states  # free memory

        encoder_query = encoder_query.unflatten(2, (attn.heads, -1))
        encoder_key = encoder_key.unflatten(2, (attn.heads, -1))
        encoder_value = encoder_value.unflatten(2, (attn.heads, -1))

        encoder_query = attn.norm_added_q(encoder_query)
        encoder_key = attn.norm_added_k(encoder_key)

        # Concatenate image and context q, k, v
        query = torch.cat([query, encoder_query], dim=1)
        key = torch.cat([key, encoder_key], dim=1)
        value = torch.cat([value, encoder_value], dim=1)
        del encoder_query, encoder_key, encoder_value  # free memory

        hidden_states_attn = attn_varlen_func(
            query, key, value, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv, attn_mode=attn_mode, split_attn=split_attn
        )
        del query, key, value  # free memory
        hidden_states_attn = hidden_states_attn.flatten(-2)

        # Split the joint result back into image / text streams.
        hidden_states, encoder_hidden_states = hidden_states_attn[:, :-txt_length], hidden_states_attn[:, -txt_length:]
        del hidden_states_attn  # free memory

        # Apply output projections
        hidden_states = attn.to_out[0](hidden_states)
        hidden_states = attn.to_out[1](hidden_states)  # Dropout/Identity
        encoder_hidden_states = attn.to_add_out(encoder_hidden_states)

        return hidden_states, encoder_hidden_states
869
+
870
+
871
class HunyuanAttnProcessorFlashAttnSingle:
    """Attention processor for single-stream blocks.

    Image and text tokens are concatenated BEFORE projection and share one set
    of q/k/v projections; RoPE is applied only to the leading image portion.
    `attention_mask` is the varlen 4-tuple consumed by `attn_varlen_func`,
    not a dense mask.
    """

    def __call__(
        self,
        attn: Attention,
        hidden_states,
        encoder_hidden_states,
        attention_mask,
        image_rotary_emb,
        attn_mode: Optional[str] = None,
        split_attn: Optional[bool] = False,
    ):
        cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv = attention_mask
        txt_length = encoder_hidden_states.shape[1]  # Store text length

        # Concatenate image and context inputs
        hidden_states_cat = torch.cat([hidden_states, encoder_hidden_states], dim=1)
        del hidden_states, encoder_hidden_states  # free memory

        # Project concatenated inputs
        query = attn.to_q(hidden_states_cat)
        key = attn.to_k(hidden_states_cat)
        value = attn.to_v(hidden_states_cat)
        del hidden_states_cat  # free memory

        # (B, L, inner) -> (B, L, heads, head_dim)
        query = query.unflatten(2, (attn.heads, -1))
        key = key.unflatten(2, (attn.heads, -1))
        value = value.unflatten(2, (attn.heads, -1))

        query = attn.norm_q(query)
        key = attn.norm_k(key)

        # Rotate only the leading image tokens; text tokens pass through unchanged.
        query = torch.cat([apply_rotary_emb_transposed(query[:, :-txt_length], image_rotary_emb), query[:, -txt_length:]], dim=1)
        key = torch.cat([apply_rotary_emb_transposed(key[:, :-txt_length], image_rotary_emb), key[:, -txt_length:]], dim=1)
        del image_rotary_emb  # free memory

        hidden_states = attn_varlen_func(
            query, key, value, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv, attn_mode=attn_mode, split_attn=split_attn
        )
        del query, key, value  # free memory
        hidden_states = hidden_states.flatten(-2)

        # Split joint result back into image / text streams (no output projection here).
        hidden_states, encoder_hidden_states = hidden_states[:, :-txt_length], hidden_states[:, -txt_length:]

        return hidden_states, encoder_hidden_states
915
+
916
+
917
class CombinedTimestepGuidanceTextProjEmbeddings(nn.Module):
    """Fuse timestep, guidance-scale and pooled-text embeddings into one conditioning vector."""

    def __init__(self, embedding_dim, pooled_projection_dim):
        super().__init__()
        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
        self.guidance_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
        self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")

    def forward(self, timestep, guidance, pooled_projection):
        # Sinusoidal features are cast to the pooled-projection dtype before the MLP embedders.
        target_dtype = pooled_projection.dtype
        timestep_emb = self.timestep_embedder(self.time_proj(timestep).to(dtype=target_dtype))
        guidance_emb = self.guidance_embedder(self.time_proj(guidance).to(dtype=target_dtype))
        # Sum of (timestep + guidance) embedding and projected pooled text embedding.
        return (timestep_emb + guidance_emb) + self.text_embedder(pooled_projection)
939
+
940
+
941
class CombinedTimestepTextProjEmbeddings(nn.Module):
    """Fuse timestep and pooled-text embeddings into one conditioning vector."""

    def __init__(self, embedding_dim, pooled_projection_dim):
        super().__init__()
        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
        self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")

    def forward(self, timestep, pooled_projection):
        # Sinusoidal features are cast to the pooled-projection dtype before the MLP embedder.
        timestep_emb = self.timestep_embedder(self.time_proj(timestep).to(dtype=pooled_projection.dtype))
        return timestep_emb + self.text_embedder(pooled_projection)
958
+
959
+
960
class HunyuanVideoAdaNorm(nn.Module):
    """Produce per-token gates for the attention and MLP branches from a conditioning embedding.

    Args:
        in_features: Dimension of the conditioning embedding.
        out_features: Output width of the projection; defaults to 2 * in_features
            so the result can be split into the two gates.
    """

    def __init__(self, in_features: int, out_features: Optional[int] = None) -> None:
        super().__init__()

        out_features = out_features or 2 * in_features
        self.linear = nn.Linear(in_features, out_features)
        self.nonlinearity = nn.SiLU()

    def forward(self, temb: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        # Fixed return annotation: this method returns two tensors
        # (gate_msa, gate_mlp), not five as previously annotated.
        temb = self.linear(self.nonlinearity(temb))
        gate_msa, gate_mlp = temb.chunk(2, dim=-1)
        # Insert a sequence axis so the gates broadcast over tokens.
        gate_msa, gate_mlp = gate_msa.unsqueeze(1), gate_mlp.unsqueeze(1)
        return gate_msa, gate_mlp
973
+
974
+
975
class HunyuanVideoIndividualTokenRefinerBlock(nn.Module):
    """One token-refiner transformer block: gated self-attention plus gated feed-forward."""

    def __init__(
        self,
        num_attention_heads: int,
        attention_head_dim: int,
        mlp_width_ratio: float = 4.0,
        mlp_drop_rate: float = 0.0,
        attention_bias: bool = True,
    ) -> None:
        super().__init__()
        hidden_size = num_attention_heads * attention_head_dim

        self.norm1 = LayerNormFramePack(hidden_size, elementwise_affine=True, eps=1e-6)
        self.attn = Attention(
            query_dim=hidden_size,
            cross_attention_dim=None,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            bias=attention_bias,
        )
        self.norm2 = LayerNormFramePack(hidden_size, elementwise_affine=True, eps=1e-6)
        self.ff = FeedForward(hidden_size, mult=mlp_width_ratio, activation_fn="linear-silu", dropout=mlp_drop_rate)
        # Produces (gate_msa, gate_mlp) from the timestep embedding.
        self.norm_out = HunyuanVideoAdaNorm(hidden_size, 2 * hidden_size)

    def forward(
        self,
        hidden_states: torch.Tensor,
        temb: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        # Self-attention branch, gated by the conditioning embedding.
        attn_out = self.attn(
            hidden_states=self.norm1(hidden_states),
            encoder_hidden_states=None,
            attention_mask=attention_mask,
        )
        gate_msa, gate_mlp = self.norm_out(temb)
        hidden_states = hidden_states + attn_out * gate_msa

        # Feed-forward branch, gated likewise.
        hidden_states = hidden_states + self.ff(self.norm2(hidden_states)) * gate_mlp
        return hidden_states
1027
+
1028
+
1029
class HunyuanVideoIndividualTokenRefiner(nn.Module):
    """Stack of token-refiner blocks sharing one self-attention mask derived from `attention_mask`."""

    def __init__(
        self,
        num_attention_heads: int,
        attention_head_dim: int,
        num_layers: int,
        mlp_width_ratio: float = 4.0,
        mlp_drop_rate: float = 0.0,
        attention_bias: bool = True,
    ) -> None:
        super().__init__()
        self.refiner_blocks = nn.ModuleList(
            [
                HunyuanVideoIndividualTokenRefinerBlock(
                    num_attention_heads=num_attention_heads,
                    attention_head_dim=attention_head_dim,
                    mlp_width_ratio=mlp_width_ratio,
                    mlp_drop_rate=mlp_drop_rate,
                    attention_bias=attention_bias,
                )
                for _ in range(num_layers)
            ]
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        temb: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        self_attn_mask = None
        if attention_mask is not None:
            # Build a (B, 1, S, S) boolean mask: position (i, j) is attendable
            # only if tokens i and j are both valid.
            batch_size = attention_mask.shape[0]
            seq_len = attention_mask.shape[1]
            mask_bool = attention_mask.to(hidden_states.device).bool()
            rows = mask_bool.view(batch_size, 1, 1, seq_len).repeat(1, 1, seq_len, 1)
            cols = rows.transpose(2, 3)
            self_attn_mask = (rows & cols).bool()
            # Keep key position 0 always attendable so no row is fully masked
            # (a fully masked row would yield NaNs in softmax).
            self_attn_mask[:, :, :, 0] = True

        for block in self.refiner_blocks:
            hidden_states = block(hidden_states, temb, self_attn_mask)
        return hidden_states
1074
+
1075
+
1076
class HunyuanVideoTokenRefiner(nn.Module):
    """Project text embeddings and refine them with timestep-conditioned transformer blocks."""

    def __init__(
        self,
        in_channels: int,
        num_attention_heads: int,
        attention_head_dim: int,
        num_layers: int,
        mlp_ratio: float = 4.0,
        mlp_drop_rate: float = 0.0,
        attention_bias: bool = True,
    ) -> None:
        super().__init__()
        hidden_size = num_attention_heads * attention_head_dim

        self.time_text_embed = CombinedTimestepTextProjEmbeddings(embedding_dim=hidden_size, pooled_projection_dim=in_channels)
        self.proj_in = nn.Linear(in_channels, hidden_size, bias=True)
        self.token_refiner = HunyuanVideoIndividualTokenRefiner(
            num_attention_heads=num_attention_heads,
            attention_head_dim=attention_head_dim,
            num_layers=num_layers,
            mlp_width_ratio=mlp_ratio,
            mlp_drop_rate=mlp_drop_rate,
            attention_bias=attention_bias,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        timestep: torch.LongTensor,
        attention_mask: Optional[torch.LongTensor] = None,
    ) -> torch.Tensor:
        # Pool the (optionally masked) mean of the text tokens for conditioning.
        if attention_mask is None:
            pooled = hidden_states.mean(dim=1)
        else:
            dtype = hidden_states.dtype
            weights = attention_mask.float().unsqueeze(-1)
            pooled = ((hidden_states * weights).sum(dim=1) / weights.sum(dim=1)).to(dtype)

        temb = self.time_text_embed(timestep, pooled)
        hidden_states = self.proj_in(hidden_states)
        return self.token_refiner(hidden_states, temb, attention_mask)
1124
+
1125
+
1126
class HunyuanVideoRotaryPosEmbed(nn.Module):
    """3D rotary position embedding over (time, height, width) axes.

    `rope_dim` gives the per-axis channel budget (DT, DY, DX). Each table
    concatenates the cos channels of all three axes followed by the matching
    sin channels, giving 2 * (DT + DY + DX) channels total.
    """

    def __init__(self, rope_dim, theta):
        super().__init__()
        self.DT, self.DY, self.DX = rope_dim
        self.theta = theta

    @torch.no_grad()
    def get_frequency(self, dim, pos):
        """Return (cos, sin) tables of shape (dim, T, H, W) for position grid `pos`."""
        T, H, W = pos.shape
        inv_freq = 1.0 / (self.theta ** (torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device)[: (dim // 2)] / dim))
        # Outer product over flattened positions, reshaped back to the grid;
        # each frequency channel is duplicated for the (even, odd) rotation pair.
        angles = torch.outer(inv_freq, pos.reshape(-1)).unflatten(-1, (T, H, W)).repeat_interleave(2, dim=0)
        return angles.cos(), angles.sin()

    @torch.no_grad()
    def forward_inner(self, frame_indices, height, width, device):
        grid_t, grid_y, grid_x = torch.meshgrid(
            frame_indices.to(device=device, dtype=torch.float32),
            torch.arange(0, height, device=device, dtype=torch.float32),
            torch.arange(0, width, device=device, dtype=torch.float32),
            indexing="ij",
        )

        cos_t, sin_t = self.get_frequency(self.DT, grid_t)
        cos_y, sin_y = self.get_frequency(self.DY, grid_y)
        cos_x, sin_x = self.get_frequency(self.DX, grid_x)

        # All cos channels first, then all sin channels: (2*(DT+DY+DX), T, H, W).
        return torch.cat([cos_t, cos_y, cos_x, sin_t, sin_y, sin_x], dim=0)

    @torch.no_grad()
    def forward(self, frame_indices, height, width, device):
        # One table per batch item, stacked on a leading batch axis.
        tables = [self.forward_inner(item, height, width, device) for item in frame_indices.unbind(0)]
        return torch.stack(tables, dim=0)
1167
+
1168
+
1169
class AdaLayerNormZero(nn.Module):
    """Adaptive LayerNorm Zero: a conditioning embedding yields six modulation
    tensors — shift/scale/gate for attention plus shift/scale/gate for the MLP."""

    def __init__(self, embedding_dim: int, norm_type="layer_norm", bias=True):
        super().__init__()
        self.silu = nn.SiLU()
        self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=bias)
        if norm_type != "layer_norm":
            raise ValueError(f"unknown norm_type {norm_type}")
        self.norm = LayerNormFramePack(embedding_dim, elementwise_affine=False, eps=1e-6)

    def forward(
        self, x: torch.Tensor, emb: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Normalize ``x``, apply the attention shift/scale, and return the
        modulated tensor together with the remaining gates and MLP modulation."""
        modulation = self.linear(self.silu(emb.unsqueeze(-2)))
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = modulation.chunk(6, dim=-1)
        modulated = self.norm(x) * (scale_msa + 1) + shift_msa
        return modulated, gate_msa, shift_mlp, scale_mlp, gate_mlp
1187
+
1188
+
1189
class AdaLayerNormZeroSingle(nn.Module):
    """Adaptive LayerNorm Zero for single-stream blocks: three modulation
    tensors (shift/scale for the norm, plus one output gate)."""

    def __init__(self, embedding_dim: int, norm_type="layer_norm", bias=True):
        super().__init__()
        self.silu = nn.SiLU()
        self.linear = nn.Linear(embedding_dim, 3 * embedding_dim, bias=bias)
        if norm_type != "layer_norm":
            raise ValueError(f"unknown norm_type {norm_type}")
        self.norm = LayerNormFramePack(embedding_dim, elementwise_affine=False, eps=1e-6)

    def forward(
        self,
        x: torch.Tensor,
        emb: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the shift/scale-modulated normalization of ``x`` and the gate."""
        modulation = self.linear(self.silu(emb.unsqueeze(-2)))
        shift_msa, scale_msa, gate_msa = modulation.chunk(3, dim=-1)
        modulated = self.norm(x) * (scale_msa + 1) + shift_msa
        return modulated, gate_msa
1210
+
1211
+
1212
class AdaLayerNormContinuous(nn.Module):
    """Final adaptive LayerNorm: the conditioning embedding produces one
    (scale, shift) pair that modulates the normalized input."""

    def __init__(
        self,
        embedding_dim: int,
        conditioning_embedding_dim: int,
        elementwise_affine=True,
        eps=1e-5,
        bias=True,
        norm_type="layer_norm",
    ):
        super().__init__()
        self.silu = nn.SiLU()
        self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias)
        if norm_type != "layer_norm":
            raise ValueError(f"unknown norm_type {norm_type}")
        # Arguments passed positionally: (dim, eps, elementwise_affine, bias).
        self.norm = LayerNormFramePack(embedding_dim, eps, elementwise_affine, bias)

    def forward(self, x: torch.Tensor, emb: torch.Tensor) -> torch.Tensor:
        """Apply conditioned scale/shift on the normalized input."""
        modulation = self.linear(self.silu(emb.unsqueeze(-2)))
        # NOTE: chunk order is (scale, shift) here, unlike the Zero variants.
        scale, shift = modulation.chunk(2, dim=-1)
        del modulation  # free memory
        return self.norm(x) * (scale + 1) + shift
1237
+
1238
+
1239
class HunyuanVideoSingleTransformerBlock(nn.Module):
    """Single-stream DiT block: image and text tokens are concatenated into one
    sequence and share a fused attention + parallel-MLP pass (Flux-style)."""

    def __init__(
        self,
        num_attention_heads: int,
        attention_head_dim: int,
        mlp_ratio: float = 4.0,
        qk_norm: str = "rms_norm",
        attn_mode: Optional[str] = None,
        split_attn: Optional[bool] = False,
    ) -> None:
        super().__init__()

        hidden_size = num_attention_heads * attention_head_dim
        mlp_dim = int(hidden_size * mlp_ratio)
        self.attn_mode = attn_mode
        self.split_attn = split_attn

        # Attention layer (pre_only=True means no output projection in Attention module itself)
        self.attn = Attention(
            query_dim=hidden_size,
            cross_attention_dim=None,
            dim_head=attention_head_dim,
            heads=num_attention_heads,
            out_dim=hidden_size,
            bias=True,
            processor=HunyuanAttnProcessorFlashAttnSingle(),
            qk_norm=qk_norm,
            eps=1e-6,
            pre_only=True,  # Crucial: Attn processor will return raw attention output
        )

        self.norm = AdaLayerNormZeroSingle(hidden_size, norm_type="layer_norm")
        self.proj_mlp = nn.Linear(hidden_size, mlp_dim)
        self.act_mlp = nn.GELU(approximate="tanh")
        # Fuses raw attention output and MLP hidden state in one projection
        # (parallel-transformer layout).
        self.proj_out = nn.Linear(hidden_size + mlp_dim, hidden_size)

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        temb: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run one single-stream pass; returns (hidden_states, encoder_hidden_states)
        split back out of the fused sequence."""
        text_seq_length = encoder_hidden_states.shape[1]
        # Text tokens are appended after image tokens for the fused pass.
        hidden_states = torch.cat([hidden_states, encoder_hidden_states], dim=1)
        del encoder_hidden_states  # free memory

        residual = hidden_states

        # 1. Input normalization
        norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
        mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))

        # Split image/text again so the processor can apply RoPE to image tokens only.
        norm_hidden_states, norm_encoder_hidden_states = (
            norm_hidden_states[:, :-text_seq_length, :],
            norm_hidden_states[:, -text_seq_length:, :],
        )

        # 2. Attention
        attn_output, context_attn_output = self.attn(
            hidden_states=norm_hidden_states,
            encoder_hidden_states=norm_encoder_hidden_states,
            attention_mask=attention_mask,
            image_rotary_emb=image_rotary_emb,
            attn_mode=self.attn_mode,
            split_attn=self.split_attn,
        )
        attn_output = torch.cat([attn_output, context_attn_output], dim=1)
        del norm_hidden_states, norm_encoder_hidden_states, context_attn_output  # free memory
        del image_rotary_emb

        # 3. Modulation and residual connection
        # Concatenate along channels; proj_out mixes attention and MLP branches.
        hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
        del attn_output, mlp_hidden_states  # free memory
        hidden_states = gate * self.proj_out(hidden_states)
        hidden_states = hidden_states + residual

        hidden_states, encoder_hidden_states = (
            hidden_states[:, :-text_seq_length, :],
            hidden_states[:, -text_seq_length:, :],
        )
        return hidden_states, encoder_hidden_states
1322
+
1323
+
1324
class HunyuanVideoTransformerBlock(nn.Module):
    """Dual-stream (MM-DiT style) block: image and text tokens keep separate
    weights but attend jointly inside one shared attention operation."""

    def __init__(
        self,
        num_attention_heads: int,
        attention_head_dim: int,
        mlp_ratio: float,
        qk_norm: str = "rms_norm",
        attn_mode: Optional[str] = None,
        split_attn: Optional[bool] = False,
    ) -> None:
        super().__init__()

        hidden_size = num_attention_heads * attention_head_dim
        self.attn_mode = attn_mode
        self.split_attn = split_attn

        # Separate AdaLN modulation for the image stream and the text (context) stream.
        self.norm1 = AdaLayerNormZero(hidden_size, norm_type="layer_norm")
        self.norm1_context = AdaLayerNormZero(hidden_size, norm_type="layer_norm")

        self.attn = Attention(
            query_dim=hidden_size,
            cross_attention_dim=None,
            added_kv_proj_dim=hidden_size,
            dim_head=attention_head_dim,
            heads=num_attention_heads,
            out_dim=hidden_size,
            context_pre_only=False,
            bias=True,
            processor=HunyuanAttnProcessorFlashAttnDouble(),
            qk_norm=qk_norm,
            eps=1e-6,
        )

        self.norm2 = LayerNormFramePack(hidden_size, elementwise_affine=False, eps=1e-6)
        self.ff = FeedForward(hidden_size, mult=mlp_ratio, activation_fn="gelu-approximate")

        self.norm2_context = LayerNormFramePack(hidden_size, elementwise_affine=False, eps=1e-6)
        self.ff_context = FeedForward(hidden_size, mult=mlp_ratio, activation_fn="gelu-approximate")

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        temb: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        freqs_cis: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """One dual-stream pass; returns updated (hidden_states, encoder_hidden_states).
        Intermediates are aggressively `del`-ed to lower peak memory."""
        # 1. Input normalization
        norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
        norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
            encoder_hidden_states, emb=temb
        )

        # 2. Joint attention
        attn_output, context_attn_output = self.attn(
            hidden_states=norm_hidden_states,
            encoder_hidden_states=norm_encoder_hidden_states,
            attention_mask=attention_mask,
            image_rotary_emb=freqs_cis,
            attn_mode=self.attn_mode,
            split_attn=self.split_attn,
        )
        del norm_hidden_states, norm_encoder_hidden_states, freqs_cis  # free memory

        # 3. Modulation and residual connection
        hidden_states = hidden_states + attn_output * gate_msa
        del attn_output, gate_msa  # free memory
        encoder_hidden_states = encoder_hidden_states + context_attn_output * c_gate_msa
        del context_attn_output, c_gate_msa  # free memory

        norm_hidden_states = self.norm2(hidden_states)
        norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)

        norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
        del shift_mlp, scale_mlp  # free memory
        norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp) + c_shift_mlp
        del c_shift_mlp, c_scale_mlp  # free memory

        # 4. Feed-forward
        ff_output = self.ff(norm_hidden_states)
        del norm_hidden_states  # free memory
        context_ff_output = self.ff_context(norm_encoder_hidden_states)
        del norm_encoder_hidden_states  # free memory

        hidden_states = hidden_states + gate_mlp * ff_output
        del ff_output, gate_mlp  # free memory
        encoder_hidden_states = encoder_hidden_states + c_gate_mlp * context_ff_output
        del context_ff_output, c_gate_mlp  # free memory

        return hidden_states, encoder_hidden_states
1414
+
1415
+
1416
class ClipVisionProjection(nn.Module):
    """Two-layer SiLU MLP projecting CLIP vision features to the model width."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.up = nn.Linear(in_channels, out_channels * 3)
        self.down = nn.Linear(out_channels * 3, out_channels)

    def forward(self, x):
        expanded = self.up(x)
        return self.down(nn.functional.silu(expanded))
1425
+
1426
+
1427
class HunyuanVideoPatchEmbed(nn.Module):
    """Non-overlapping 3D patchify of latents via a strided Conv3d projection."""

    def __init__(self, patch_size, in_chans, embed_dim):
        super().__init__()
        # kernel == stride gives non-overlapping patches.
        self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
1431
+
1432
+
1433
class HunyuanVideoPatchEmbedForCleanLatents(nn.Module):
    """Patch embedders for clean-context latents at 1x, 2x and 4x compression."""

    def __init__(self, inner_dim):
        super().__init__()
        self.proj = nn.Conv3d(16, inner_dim, kernel_size=(1, 2, 2), stride=(1, 2, 2))
        self.proj_2x = nn.Conv3d(16, inner_dim, kernel_size=(2, 4, 4), stride=(2, 4, 4))
        self.proj_4x = nn.Conv3d(16, inner_dim, kernel_size=(4, 8, 8), stride=(4, 8, 8))

    @torch.no_grad()
    def initialize_weight_from_another_conv3d(self, another_layer):
        """Seed all three projections from a reference (1,2,2) Conv3d by tiling
        its kernel; division keeps the output magnitude comparable."""
        weight = another_layer.weight.detach().clone()
        bias = another_layer.bias.detach().clone()

        state = {
            "proj.weight": weight.clone(),
            "proj.bias": bias.clone(),
            # Tile the kernel 2x per axis; /8 compensates for the 8x larger receptive volume.
            "proj_2x.weight": einops.repeat(weight, "b c t h w -> b c (t tk) (h hk) (w wk)", tk=2, hk=2, wk=2) / 8.0,
            "proj_2x.bias": bias.clone(),
            "proj_4x.weight": einops.repeat(weight, "b c t h w -> b c (t tk) (h hk) (w wk)", tk=4, hk=4, wk=4) / 64.0,
            "proj_4x.bias": bias.clone(),
        }
        state = {key: value.clone() for key, value in state.items()}

        self.load_state_dict(state)
        return
1458
+
1459
+
1460
class HunyuanVideoTransformer3DModelPacked(nn.Module):  # (PreTrainedModelMixin, GenerationMixin,
    # ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
    """FramePack-packed HunyuanVideo DiT.

    Besides the noisy latents, "clean" history latents can be packed into the
    same token sequence at 1x/2x/4x compression (each with matching RoPE
    frequencies), and CLIP-vision image embeddings are prepended to the text
    context. Supports gradient checkpointing, TeaCache step skipping, and
    double/single block swapping between CPU and GPU for low-VRAM setups.
    """

    # @register_to_config
    def __init__(
        self,
        in_channels: int = 16,
        out_channels: int = 16,
        num_attention_heads: int = 24,
        attention_head_dim: int = 128,
        num_layers: int = 20,
        num_single_layers: int = 40,
        num_refiner_layers: int = 2,
        mlp_ratio: float = 4.0,
        patch_size: int = 2,
        patch_size_t: int = 1,
        qk_norm: str = "rms_norm",
        guidance_embeds: bool = True,
        text_embed_dim: int = 4096,
        pooled_projection_dim: int = 768,
        rope_theta: float = 256.0,
        rope_axes_dim: Tuple[int] = (16, 56, 56),
        has_image_proj=False,
        image_proj_dim=1152,
        has_clean_x_embedder=False,
        attn_mode: Optional[str] = None,
        split_attn: Optional[bool] = False,
    ) -> None:
        super().__init__()

        inner_dim = num_attention_heads * attention_head_dim
        out_channels = out_channels or in_channels
        self.config_patch_size = patch_size
        self.config_patch_size_t = patch_size_t

        # 1. Latent and condition embedders
        self.x_embedder = HunyuanVideoPatchEmbed((patch_size_t, patch_size, patch_size), in_channels, inner_dim)
        self.context_embedder = HunyuanVideoTokenRefiner(
            text_embed_dim, num_attention_heads, attention_head_dim, num_layers=num_refiner_layers
        )
        self.time_text_embed = CombinedTimestepGuidanceTextProjEmbeddings(inner_dim, pooled_projection_dim)

        self.clean_x_embedder = None
        self.image_projection = None

        # 2. RoPE
        self.rope = HunyuanVideoRotaryPosEmbed(rope_axes_dim, rope_theta)

        # 3. Dual stream transformer blocks
        self.transformer_blocks = nn.ModuleList(
            [
                HunyuanVideoTransformerBlock(
                    num_attention_heads,
                    attention_head_dim,
                    mlp_ratio=mlp_ratio,
                    qk_norm=qk_norm,
                    attn_mode=attn_mode,
                    split_attn=split_attn,
                )
                for _ in range(num_layers)
            ]
        )

        # 4. Single stream transformer blocks
        self.single_transformer_blocks = nn.ModuleList(
            [
                HunyuanVideoSingleTransformerBlock(
                    num_attention_heads,
                    attention_head_dim,
                    mlp_ratio=mlp_ratio,
                    qk_norm=qk_norm,
                    attn_mode=attn_mode,
                    split_attn=split_attn,
                )
                for _ in range(num_single_layers)
            ]
        )

        # 5. Output projection
        self.norm_out = AdaLayerNormContinuous(inner_dim, inner_dim, elementwise_affine=False, eps=1e-6)
        self.proj_out = nn.Linear(inner_dim, patch_size_t * patch_size * patch_size * out_channels)

        self.inner_dim = inner_dim
        self.use_gradient_checkpointing = False
        self.enable_teacache = False

        # if has_image_proj:
        #     self.install_image_projection(image_proj_dim)
        # NOTE(review): the image projection and clean-x embedder are installed
        # unconditionally below; has_image_proj / has_clean_x_embedder are accepted
        # but not consulted.
        self.image_projection = ClipVisionProjection(in_channels=image_proj_dim, out_channels=self.inner_dim)
        # self.config["has_image_proj"] = True
        # self.config["image_proj_dim"] = in_channels

        # if has_clean_x_embedder:
        #     self.install_clean_x_embedder()
        self.clean_x_embedder = HunyuanVideoPatchEmbedForCleanLatents(self.inner_dim)
        # self.config["has_clean_x_embedder"] = True

        self.high_quality_fp32_output_for_inference = True  # False # change default to True

        # Block swapping attributes (initialized to None)
        self.blocks_to_swap = None
        self.offloader_double = None
        self.offloader_single = None

    @property
    def device(self):
        # Device of the first parameter; assumes all parameters share one device.
        return next(self.parameters()).device

    @property
    def dtype(self):
        # Dtype of the first parameter; assumes a uniform parameter dtype.
        return next(self.parameters()).dtype

    def enable_gradient_checkpointing(self):
        self.use_gradient_checkpointing = True
        print("Gradient checkpointing enabled for HunyuanVideoTransformer3DModelPacked.")  # Logging

    def disable_gradient_checkpointing(self):
        self.use_gradient_checkpointing = False
        print("Gradient checkpointing disabled for HunyuanVideoTransformer3DModelPacked.")  # Logging

    def initialize_teacache(self, enable_teacache=True, num_steps=25, rel_l1_thresh=0.15):
        """Reset TeaCache state. When enabled, forward() may reuse the previous
        step's residual instead of running the transformer blocks."""
        self.enable_teacache = enable_teacache
        self.cnt = 0
        self.num_steps = num_steps
        self.rel_l1_thresh = rel_l1_thresh  # 0.1 for 1.6x speedup, 0.15 for 2.1x speedup
        self.accumulated_rel_l1_distance = 0
        self.previous_modulated_input = None
        self.previous_residual = None
        # Polynomial mapping raw relative-L1 distance to the accumulated metric;
        # coefficients presumably fitted offline for this model — TODO confirm.
        self.teacache_rescale_func = np.poly1d([7.33226126e02, -4.01131952e02, 6.75869174e01, -3.14987800e00, 9.61237896e-02])
        if enable_teacache:
            print(f"TeaCache enabled: num_steps={num_steps}, rel_l1_thresh={rel_l1_thresh}")
        else:
            print("TeaCache disabled.")

    def gradient_checkpointing_method(self, block, *args):
        """Call `block(*args)`, through torch checkpointing when enabled."""
        if self.use_gradient_checkpointing:
            result = torch.utils.checkpoint.checkpoint(block, *args, use_reentrant=False)
        else:
            result = block(*args)
        return result

    def enable_block_swap(self, num_blocks: int, device: torch.device, supports_backward: bool):
        """Set up CPU<->GPU offloading for transformer blocks.

        The budget is split between the two stacks: a single block is roughly
        half the size of a double block, hence the *2+1 conversion below.
        """
        self.blocks_to_swap = num_blocks
        self.num_double_blocks = len(self.transformer_blocks)
        self.num_single_blocks = len(self.single_transformer_blocks)
        double_blocks_to_swap = num_blocks // 2
        single_blocks_to_swap = (num_blocks - double_blocks_to_swap) * 2 + 1

        assert double_blocks_to_swap <= self.num_double_blocks - 1 and single_blocks_to_swap <= self.num_single_blocks - 1, (
            f"Cannot swap more than {self.num_double_blocks - 1} double blocks and {self.num_single_blocks - 1} single blocks. "
            f"Requested {double_blocks_to_swap} double blocks and {single_blocks_to_swap} single blocks."
        )

        self.offloader_double = ModelOffloader(
            "double",
            self.transformer_blocks,
            self.num_double_blocks,
            double_blocks_to_swap,
            supports_backward,
            device,
            # debug=True # Optional debugging
        )
        self.offloader_single = ModelOffloader(
            "single",
            self.single_transformer_blocks,
            self.num_single_blocks,
            single_blocks_to_swap,
            supports_backward,
            device,  # , debug=True
        )
        print(
            f"HunyuanVideoTransformer3DModelPacked: Block swap enabled. Swapping {num_blocks} blocks, "
            + f"double blocks: {double_blocks_to_swap}, single blocks: {single_blocks_to_swap}, supports_backward: {supports_backward}."
        )

    def switch_block_swap_for_inference(self):
        """Restrict offloading to forward passes only (inference mode)."""
        if self.blocks_to_swap and self.blocks_to_swap > 0:
            self.offloader_double.set_forward_only(True)
            self.offloader_single.set_forward_only(True)
            self.prepare_block_swap_before_forward()
            print(f"HunyuanVideoTransformer3DModelPacked: Block swap set to forward only.")

    def switch_block_swap_for_training(self):
        """Re-enable offloading for both forward and backward passes."""
        if self.blocks_to_swap and self.blocks_to_swap > 0:
            self.offloader_double.set_forward_only(False)
            self.offloader_single.set_forward_only(False)
            self.prepare_block_swap_before_forward()
            print(f"HunyuanVideoTransformer3DModelPacked: Block swap set to forward and backward.")

    def move_to_device_except_swap_blocks(self, device: torch.device):
        # assume model is on cpu. do not move blocks to device to reduce temporary memory usage
        if self.blocks_to_swap:
            # Temporarily detach the block lists so self.to() skips them.
            saved_double_blocks = self.transformer_blocks
            saved_single_blocks = self.single_transformer_blocks
            self.transformer_blocks = None
            self.single_transformer_blocks = None

        self.to(device)

        if self.blocks_to_swap:
            self.transformer_blocks = saved_double_blocks
            self.single_transformer_blocks = saved_single_blocks

    def prepare_block_swap_before_forward(self):
        # No-op when block swapping is disabled.
        if self.blocks_to_swap is None or self.blocks_to_swap == 0:
            return
        self.offloader_double.prepare_block_devices_before_forward(self.transformer_blocks)
        self.offloader_single.prepare_block_devices_before_forward(self.single_transformer_blocks)

    def process_input_hidden_states(
        self,
        latents,
        latent_indices=None,
        clean_latents=None,
        clean_latent_indices=None,
        clean_latents_2x=None,
        clean_latent_2x_indices=None,
        clean_latents_4x=None,
        clean_latent_4x_indices=None,
    ):
        """Patchify noisy + clean latents into one token sequence.

        Clean latents at 2x/4x compression are padded, embedded with coarser
        conv kernels, and their RoPE frequencies downsampled to match. Clean
        tokens are prepended (4x, then 2x, then 1x) before the noisy tokens.
        Returns (hidden_states, rope_freqs), both of shape (B, seq, ...).
        """
        hidden_states = self.gradient_checkpointing_method(self.x_embedder.proj, latents)
        B, C, T, H, W = hidden_states.shape

        if latent_indices is None:
            # Default: frames are consecutive starting at 0.
            latent_indices = torch.arange(0, T).unsqueeze(0).expand(B, -1)

        hidden_states = hidden_states.flatten(2).transpose(1, 2)

        rope_freqs = self.rope(frame_indices=latent_indices, height=H, width=W, device=hidden_states.device)
        rope_freqs = rope_freqs.flatten(2).transpose(1, 2)

        if clean_latents is not None and clean_latent_indices is not None:
            clean_latents = clean_latents.to(hidden_states)
            clean_latents = self.gradient_checkpointing_method(self.clean_x_embedder.proj, clean_latents)
            clean_latents = clean_latents.flatten(2).transpose(1, 2)

            clean_latent_rope_freqs = self.rope(frame_indices=clean_latent_indices, height=H, width=W, device=clean_latents.device)
            clean_latent_rope_freqs = clean_latent_rope_freqs.flatten(2).transpose(1, 2)

            hidden_states = torch.cat([clean_latents, hidden_states], dim=1)
            rope_freqs = torch.cat([clean_latent_rope_freqs, rope_freqs], dim=1)

        if clean_latents_2x is not None and clean_latent_2x_indices is not None:
            clean_latents_2x = clean_latents_2x.to(hidden_states)
            clean_latents_2x = pad_for_3d_conv(clean_latents_2x, (2, 4, 4))
            clean_latents_2x = self.gradient_checkpointing_method(self.clean_x_embedder.proj_2x, clean_latents_2x)
            clean_latents_2x = clean_latents_2x.flatten(2).transpose(1, 2)

            clean_latent_2x_rope_freqs = self.rope(
                frame_indices=clean_latent_2x_indices, height=H, width=W, device=clean_latents_2x.device
            )
            # Downsample RoPE frequencies to match the coarser token grid.
            clean_latent_2x_rope_freqs = pad_for_3d_conv(clean_latent_2x_rope_freqs, (2, 2, 2))
            clean_latent_2x_rope_freqs = center_down_sample_3d(clean_latent_2x_rope_freqs, (2, 2, 2))
            clean_latent_2x_rope_freqs = clean_latent_2x_rope_freqs.flatten(2).transpose(1, 2)

            hidden_states = torch.cat([clean_latents_2x, hidden_states], dim=1)
            rope_freqs = torch.cat([clean_latent_2x_rope_freqs, rope_freqs], dim=1)

        if clean_latents_4x is not None and clean_latent_4x_indices is not None:
            clean_latents_4x = clean_latents_4x.to(hidden_states)
            clean_latents_4x = pad_for_3d_conv(clean_latents_4x, (4, 8, 8))
            clean_latents_4x = self.gradient_checkpointing_method(self.clean_x_embedder.proj_4x, clean_latents_4x)
            clean_latents_4x = clean_latents_4x.flatten(2).transpose(1, 2)

            clean_latent_4x_rope_freqs = self.rope(
                frame_indices=clean_latent_4x_indices, height=H, width=W, device=clean_latents_4x.device
            )
            clean_latent_4x_rope_freqs = pad_for_3d_conv(clean_latent_4x_rope_freqs, (4, 4, 4))
            clean_latent_4x_rope_freqs = center_down_sample_3d(clean_latent_4x_rope_freqs, (4, 4, 4))
            clean_latent_4x_rope_freqs = clean_latent_4x_rope_freqs.flatten(2).transpose(1, 2)

            hidden_states = torch.cat([clean_latents_4x, hidden_states], dim=1)
            rope_freqs = torch.cat([clean_latent_4x_rope_freqs, rope_freqs], dim=1)

        return hidden_states, rope_freqs

    def forward(
        self,
        hidden_states,
        timestep,
        encoder_hidden_states,
        encoder_attention_mask,
        pooled_projections,
        guidance,
        latent_indices=None,
        clean_latents=None,
        clean_latent_indices=None,
        clean_latents_2x=None,
        clean_latent_2x_indices=None,
        clean_latents_4x=None,
        clean_latent_4x_indices=None,
        image_embeddings=None,
        attention_kwargs=None,
        return_dict=True,
    ):
        """Denoise one step. Returns a namespace with `.sample` (or a 1-tuple),
        the predicted latents of the same spatial shape as the noisy input."""

        if attention_kwargs is None:
            attention_kwargs = {}

        batch_size, num_channels, num_frames, height, width = hidden_states.shape
        p, p_t = self.config_patch_size, self.config_patch_size_t
        post_patch_num_frames = num_frames // p_t
        post_patch_height = height // p
        post_patch_width = width // p
        # Token count of the noisy portion; clean-context tokens are sliced off
        # again before the output projection.
        original_context_length = post_patch_num_frames * post_patch_height * post_patch_width

        hidden_states, rope_freqs = self.process_input_hidden_states(
            hidden_states,
            latent_indices,
            clean_latents,
            clean_latent_indices,
            clean_latents_2x,
            clean_latent_2x_indices,
            clean_latents_4x,
            clean_latent_4x_indices,
        )
        del (
            latent_indices,
            clean_latents,
            clean_latent_indices,
            clean_latents_2x,
            clean_latent_2x_indices,
            clean_latents_4x,
            clean_latent_4x_indices,
        )  # free memory

        temb = self.gradient_checkpointing_method(self.time_text_embed, timestep, guidance, pooled_projections)
        encoder_hidden_states = self.gradient_checkpointing_method(
            self.context_embedder, encoder_hidden_states, timestep, encoder_attention_mask
        )

        if self.image_projection is not None:
            assert image_embeddings is not None, "You must use image embeddings!"
            extra_encoder_hidden_states = self.gradient_checkpointing_method(self.image_projection, image_embeddings)
            extra_attention_mask = torch.ones(
                (batch_size, extra_encoder_hidden_states.shape[1]),
                dtype=encoder_attention_mask.dtype,
                device=encoder_attention_mask.device,
            )

            # must cat before (not after) encoder_hidden_states, due to attn masking
            encoder_hidden_states = torch.cat([extra_encoder_hidden_states, encoder_hidden_states], dim=1)
            encoder_attention_mask = torch.cat([extra_attention_mask, encoder_attention_mask], dim=1)
            del extra_encoder_hidden_states, extra_attention_mask  # free memory

        with torch.no_grad():
            if batch_size == 1:
                # When batch size is 1, we do not need any masks or var-len funcs since cropping is mathematically same to what we want
                # If they are not same, then their impls are wrong. Ours are always the correct one.
                text_len = encoder_attention_mask.sum().item()
                encoder_hidden_states = encoder_hidden_states[:, :text_len]
                attention_mask = None, None, None, None
            else:
                img_seq_len = hidden_states.shape[1]
                txt_seq_len = encoder_hidden_states.shape[1]

                cu_seqlens_q = get_cu_seqlens(encoder_attention_mask, img_seq_len)
                cu_seqlens_kv = cu_seqlens_q
                max_seqlen_q = img_seq_len + txt_seq_len
                max_seqlen_kv = max_seqlen_q

                # Packed as a 4-tuple consumed by the attention processors.
                attention_mask = cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv
                del cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv  # free memory
            del encoder_attention_mask  # free memory

        if self.enable_teacache:
            # Use the first block's modulated input as a cheap proxy for how much
            # this step's computation would differ from the previous step's.
            modulated_inp = self.transformer_blocks[0].norm1(hidden_states, emb=temb)[0]

            if self.cnt == 0 or self.cnt == self.num_steps - 1:
                should_calc = True
                self.accumulated_rel_l1_distance = 0
            else:
                curr_rel_l1 = (
                    ((modulated_inp - self.previous_modulated_input).abs().mean() / self.previous_modulated_input.abs().mean())
                    .cpu()
                    .item()
                )
                self.accumulated_rel_l1_distance += self.teacache_rescale_func(curr_rel_l1)
                should_calc = self.accumulated_rel_l1_distance >= self.rel_l1_thresh

                if should_calc:
                    self.accumulated_rel_l1_distance = 0

            self.previous_modulated_input = modulated_inp
            self.cnt += 1

            if self.cnt == self.num_steps:
                self.cnt = 0

            if not should_calc:
                # Skip the transformer: replay the cached residual from the last
                # fully computed step.
                hidden_states = hidden_states + self.previous_residual
            else:
                ori_hidden_states = hidden_states.clone()

                for block_id, block in enumerate(self.transformer_blocks):
                    hidden_states, encoder_hidden_states = self.gradient_checkpointing_method(
                        block, hidden_states, encoder_hidden_states, temb, attention_mask, rope_freqs
                    )

                for block_id, block in enumerate(self.single_transformer_blocks):
                    hidden_states, encoder_hidden_states = self.gradient_checkpointing_method(
                        block, hidden_states, encoder_hidden_states, temb, attention_mask, rope_freqs
                    )

                self.previous_residual = hidden_states - ori_hidden_states
                del ori_hidden_states  # free memory
        else:
            for block_id, block in enumerate(self.transformer_blocks):
                if self.blocks_to_swap:
                    self.offloader_double.wait_for_block(block_id)

                hidden_states, encoder_hidden_states = self.gradient_checkpointing_method(
                    block, hidden_states, encoder_hidden_states, temb, attention_mask, rope_freqs
                )

                if self.blocks_to_swap:
                    self.offloader_double.submit_move_blocks_forward(self.transformer_blocks, block_id)

            for block_id, block in enumerate(self.single_transformer_blocks):
                if self.blocks_to_swap:
                    self.offloader_single.wait_for_block(block_id)

                hidden_states, encoder_hidden_states = self.gradient_checkpointing_method(
                    block, hidden_states, encoder_hidden_states, temb, attention_mask, rope_freqs
                )

                if self.blocks_to_swap:
                    self.offloader_single.submit_move_blocks_forward(self.single_transformer_blocks, block_id)

        del attention_mask, rope_freqs  # free memory
        del encoder_hidden_states  # free memory

        hidden_states = self.gradient_checkpointing_method(self.norm_out, hidden_states, temb)

        # Drop the clean-context tokens; keep only the noisy-latent tokens.
        hidden_states = hidden_states[:, -original_context_length:, :]

        if self.high_quality_fp32_output_for_inference:
            hidden_states = hidden_states.to(dtype=torch.float32)
            if self.proj_out.weight.dtype != torch.float32:
                self.proj_out.to(dtype=torch.float32)

        hidden_states = self.gradient_checkpointing_method(self.proj_out, hidden_states)

        # Un-patchify back to (B, C, T, H, W).
        hidden_states = einops.rearrange(
            hidden_states,
            "b (t h w) (c pt ph pw) -> b c (t pt) (h ph) (w pw)",
            t=post_patch_num_frames,
            h=post_patch_height,
            w=post_patch_width,
            pt=p_t,
            ph=p,
            pw=p,
        )

        if return_dict:
            # return Transformer2DModelOutput(sample=hidden_states)
            return SimpleNamespace(sample=hidden_states)

        return (hidden_states,)

    def fp8_optimization(
        self, state_dict: dict[str, torch.Tensor], device: torch.device, move_to_device: bool, use_scaled_mm: bool = False
    ) -> dict[str, torch.Tensor]:  # Return type hint added
        """
        Optimize the model state_dict with fp8.

        Args:
            state_dict (dict[str, torch.Tensor]):
                The state_dict of the model.
            device (torch.device):
                The device to calculate the weight.
            move_to_device (bool):
                Whether to move the weight to the device after optimization.
            use_scaled_mm (bool):
                Whether to use scaled matrix multiplication for FP8.
        """
        TARGET_KEYS = ["transformer_blocks", "single_transformer_blocks"]
        EXCLUDE_KEYS = ["norm"]  # Exclude norm layers (e.g., LayerNorm, RMSNorm) from FP8

        # inplace optimization
        state_dict = optimize_state_dict_with_fp8(state_dict, device, TARGET_KEYS, EXCLUDE_KEYS, move_to_device=move_to_device)

        # apply monkey patching
        apply_fp8_monkey_patch(self, state_dict, use_scaled_mm=use_scaled_mm)

        return state_dict
1945
+
1946
+
1947
def load_packed_model(
    device: Union[str, torch.device],
    dit_path: str,
    attn_mode: str,
    loading_device: Union[str, torch.device],
    fp8_scaled: bool = False,
    split_attn: bool = False,
) -> HunyuanVideoTransformer3DModelPacked:
    """Instantiate the packed HunyuanVideo DiT and load its weights.

    Args:
        device: compute device used for fp8 weight optimization.
        dit_path: safetensors file, or a directory containing one.
        attn_mode: attention backend passed through to the model.
        loading_device: where the loaded weights should finally live.
        fp8_scaled: optimize weights to scaled fp8 before loading.
        split_attn: forwarded to the model (not fully supported yet).
    """
    # TODO support split_attn
    device = torch.device(device)
    loading_device = torch.device(loading_device)

    if os.path.isdir(dit_path):
        # we don't support from_pretrained for now, so loading safetensors directly
        candidates = sorted(glob.glob(os.path.join(dit_path, "*.safetensors")))
        if not candidates:
            raise ValueError(f"Cannot find safetensors file in {dit_path}")
        # sort by name and take the first one
        dit_path = candidates[0]

    with init_empty_weights():
        logger.info(f"Creating HunyuanVideoTransformer3DModelPacked")
        model = HunyuanVideoTransformer3DModelPacked(
            attention_head_dim=128,
            guidance_embeds=True,
            has_clean_x_embedder=True,
            has_image_proj=True,
            image_proj_dim=1152,
            in_channels=16,
            mlp_ratio=4.0,
            num_attention_heads=24,
            num_layers=20,
            num_refiner_layers=2,
            num_single_layers=40,
            out_channels=16,
            patch_size=2,
            patch_size_t=1,
            pooled_projection_dim=768,
            qk_norm="rms_norm",
            rope_axes_dim=(16, 56, 56),
            rope_theta=256.0,
            text_embed_dim=4096,
            attn_mode=attn_mode,
            split_attn=split_attn,
        )

    # if fp8_scaled, load model weights to CPU to reduce VRAM usage. Otherwise, load to the specified device (CPU for block swap or CUDA for others)
    dit_loading_device = torch.device("cpu") if fp8_scaled else loading_device
    logger.info(f"Loading DiT model from {dit_path}, device={dit_loading_device}")

    # load model weights with the specified dtype or as is
    sd = load_split_weights(dit_path, device=dit_loading_device, disable_mmap=True)

    if fp8_scaled:
        # fp8 optimization: calculate on CUDA, move back to CPU if loading_device is CPU (block swap)
        logger.info(f"Optimizing model weights to fp8. This may take a while.")
        sd = model.fp8_optimization(sd, device, move_to_device=loading_device.type == "cpu")

        if loading_device.type != "cpu":
            # make sure all the model weights are on the loading_device
            logger.info(f"Moving weights to {loading_device}")
            for key in sd.keys():
                sd[key] = sd[key].to(loading_device)

    info = model.load_state_dict(sd, strict=True, assign=True)
    logger.info(f"Loaded DiT model from {dit_path}, info={info}")

    return model
frame_pack/k_diffusion_hunyuan.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # original code: https://github.com/lllyasviel/FramePack
2
+ # original license: Apache-2.0
3
+
4
+ import torch
5
+ import math
6
+
7
+ # from diffusers_helper.k_diffusion.uni_pc_fm import sample_unipc
8
+ # from diffusers_helper.k_diffusion.wrapper import fm_wrapper
9
+ # from diffusers_helper.utils import repeat_to_batch_size
10
+ from frame_pack.uni_pc_fm import sample_unipc
11
+ from frame_pack.wrapper import fm_wrapper
12
+ from frame_pack.utils import repeat_to_batch_size
13
+
14
+
15
def flux_time_shift(t, mu=1.15, sigma=1.0):
    """Apply the flux time-shift warp to timestep(s) t; works on floats and tensors."""
    e_mu = math.exp(mu)
    return e_mu / (e_mu + (1 / t - 1) ** sigma)
17
+
18
+
19
def calculate_flux_mu(context_length, x1=256, y1=0.5, x2=4096, y2=1.15, exp_max=7.0):
    """Linearly interpolate mu from the context length, capped so exp(mu) <= exp_max."""
    slope = (y2 - y1) / (x2 - x1)
    intercept = y1 - slope * x1
    return min(slope * context_length + intercept, math.log(exp_max))
25
+
26
+
27
def get_flux_sigmas_from_mu(n, mu):
    """Build the n+1 descending sigma schedule on [1, 0], warped by mu."""
    uniform = torch.linspace(1, 0, steps=n + 1)
    return flux_time_shift(uniform, mu=mu)
31
+
32
+
33
# @torch.inference_mode()
def sample_hunyuan(
    transformer,
    sampler="unipc",
    initial_latent=None,
    concat_latent=None,
    strength=1.0,
    width=512,
    height=512,
    frames=16,
    real_guidance_scale=1.0,
    distilled_guidance_scale=6.0,
    guidance_rescale=0.0,
    shift=None,
    num_inference_steps=25,
    batch_size=None,
    generator=None,
    prompt_embeds=None,
    prompt_embeds_mask=None,
    prompt_poolers=None,
    negative_prompt_embeds=None,
    negative_prompt_embeds_mask=None,
    negative_prompt_poolers=None,
    dtype=torch.bfloat16,
    device=None,
    negative_kwargs=None,
    callback=None,
    **kwargs,
):
    """Run flow-matching sampling with the packed HunyuanVideo transformer.

    Builds initial noise latents, a mu-shifted sigma schedule, batches the
    conditioning tensors, and delegates to the UniPC sampler. Returns the
    final denoised latents.

    Raises:
        NotImplementedError: if `sampler` is not "unipc".
    """
    device = device or transformer.device

    if batch_size is None:
        batch_size = int(prompt_embeds.shape[0])

    # Fix: the original read `generator.device` unconditionally, which raised
    # AttributeError when `generator` is None (the default). Fall back to the
    # target device in that case.
    noise_device = generator.device if generator is not None else device
    latents = torch.randn(
        (batch_size, 16, (frames + 3) // 4, height // 8, width // 8), generator=generator, device=noise_device
    ).to(device=device, dtype=torch.float32)

    B, C, T, H, W = latents.shape
    seq_length = T * H * W // 4  # 9*80*80//4 = 14400

    if shift is None:
        mu = calculate_flux_mu(seq_length, exp_max=7.0)  # 1.9459... if seq_len is large, mu is clipped.
    else:
        mu = math.log(shift)

    sigmas = get_flux_sigmas_from_mu(num_inference_steps, mu).to(device)

    k_model = fm_wrapper(transformer)

    if initial_latent is not None:
        # img2img-style start: scale the schedule by strength and blend the
        # initial latent with noise at the first sigma.
        sigmas = sigmas * strength
        first_sigma = sigmas[0].to(device=device, dtype=torch.float32)
        initial_latent = initial_latent.to(device=device, dtype=torch.float32)
        latents = initial_latent.float() * (1.0 - first_sigma) + latents.float() * first_sigma

    if concat_latent is not None:
        concat_latent = concat_latent.to(latents)

    # Distilled guidance is passed as a scaled scalar per sample (x1000 convention).
    distilled_guidance = torch.tensor([distilled_guidance_scale * 1000.0] * batch_size).to(device=device, dtype=dtype)

    prompt_embeds = repeat_to_batch_size(prompt_embeds, batch_size)
    prompt_embeds_mask = repeat_to_batch_size(prompt_embeds_mask, batch_size)
    prompt_poolers = repeat_to_batch_size(prompt_poolers, batch_size)
    negative_prompt_embeds = repeat_to_batch_size(negative_prompt_embeds, batch_size)
    negative_prompt_embeds_mask = repeat_to_batch_size(negative_prompt_embeds_mask, batch_size)
    negative_prompt_poolers = repeat_to_batch_size(negative_prompt_poolers, batch_size)
    concat_latent = repeat_to_batch_size(concat_latent, batch_size)

    sampler_kwargs = dict(
        dtype=dtype,
        cfg_scale=real_guidance_scale,
        cfg_rescale=guidance_rescale,
        concat_latent=concat_latent,
        positive=dict(
            pooled_projections=prompt_poolers,
            encoder_hidden_states=prompt_embeds,
            encoder_attention_mask=prompt_embeds_mask,
            guidance=distilled_guidance,
            **kwargs,
        ),
        negative=dict(
            pooled_projections=negative_prompt_poolers,
            encoder_hidden_states=negative_prompt_embeds,
            encoder_attention_mask=negative_prompt_embeds_mask,
            guidance=distilled_guidance,
            **(kwargs if negative_kwargs is None else {**kwargs, **negative_kwargs}),
        ),
    )

    if sampler == "unipc":
        results = sample_unipc(k_model, latents, sigmas, extra_args=sampler_kwargs, disable=False, callback=callback)
    else:
        raise NotImplementedError(f"Sampler {sampler} is not supported.")

    return results
frame_pack/uni_pc_fm.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Better Flow Matching UniPC by Lvmin Zhang
2
+ # (c) 2025
3
+ # CC BY-SA 4.0
4
+ # Attribution-ShareAlike 4.0 International Licence
5
+
6
+
7
+ import torch
8
+
9
+ from tqdm.auto import trange
10
+
11
+
12
def expand_dims(v, dims):
    """Append dims-1 trailing singleton axes to v (v is expected to be 1-D)."""
    indexer = (Ellipsis,) + (None,) * (dims - 1)
    return v[indexer]
14
+
15
+
16
class FlowMatchUniPC:
    """UniPC predictor-corrector solver adapted to flow-matching ODEs.

    `model` is a denoiser returning an x0-style prediction; `extra_args` are
    forwarded verbatim to every model call. `variant` selects the B(h)
    scaling ('bh1' or 'bh2').
    """

    def __init__(self, model, extra_args, variant='bh1'):
        self.model = model
        self.variant = variant
        self.extra_args = extra_args

    def model_fn(self, x, t):
        # Evaluate the wrapped model at state x and (batched) time t.
        return self.model(x, t, **self.extra_args)

    def update_fn(self, x, model_prev_list, t_prev_list, t, order):
        """One UniPC step from t_prev_list[-1] down to t using `order` history points.

        Returns (x_t, model_t): the updated state and the model output at t.
        """
        assert order <= len(model_prev_list)
        dims = x.dim()

        # Work in log-time (lambda) coordinates.
        t_prev_0 = t_prev_list[-1]
        lambda_prev_0 = - torch.log(t_prev_0)
        lambda_t = - torch.log(t)
        model_prev_0 = model_prev_list[-1]

        h = lambda_t - lambda_prev_0

        # rks: normalized lambda offsets of older history points;
        # D1s: matching scaled first differences of the model outputs.
        rks = []
        D1s = []
        for i in range(1, order):
            t_prev_i = t_prev_list[-(i + 1)]
            model_prev_i = model_prev_list[-(i + 1)]
            lambda_prev_i = - torch.log(t_prev_i)
            rk = ((lambda_prev_i - lambda_prev_0) / h)[0]
            rks.append(rk)
            D1s.append((model_prev_i - model_prev_0) / rk)

        rks.append(1.)
        rks = torch.tensor(rks, device=x.device)

        R = []
        b = []

        hh = -h[0]
        h_phi_1 = torch.expm1(hh)  # phi_1(h) = e^h - 1
        h_phi_k = h_phi_1 / hh - 1

        factorial_i = 1

        if self.variant == 'bh1':
            B_h = hh
        elif self.variant == 'bh2':
            B_h = torch.expm1(hh)
        else:
            raise NotImplementedError('Bad variant!')

        # Build the Vandermonde-style system R @ rho = b for the step weights.
        for i in range(1, order + 1):
            R.append(torch.pow(rks, i - 1))
            b.append(h_phi_k * factorial_i / B_h)
            factorial_i *= (i + 1)
            h_phi_k = h_phi_k / hh - 1 / factorial_i

        R = torch.stack(R)
        b = torch.tensor(b, device=x.device)

        use_predictor = len(D1s) > 0

        if use_predictor:
            D1s = torch.stack(D1s, dim=1)
            if order == 2:
                # Closed-form weight for the 2nd-order case.
                rhos_p = torch.tensor([0.5], device=b.device)
            else:
                rhos_p = torch.linalg.solve(R[:-1, :-1], b[:-1])
        else:
            D1s = None
            rhos_p = None

        if order == 1:
            rhos_c = torch.tensor([0.5], device=b.device)
        else:
            rhos_c = torch.linalg.solve(R, b)

        # Predictor: explicit step using only the history.
        x_t_ = expand_dims(t / t_prev_0, dims) * x - expand_dims(h_phi_1, dims) * model_prev_0

        if use_predictor:
            pred_res = torch.tensordot(D1s, rhos_p, dims=([1], [0]))
        else:
            pred_res = 0

        x_t = x_t_ - expand_dims(B_h, dims) * pred_res
        # Corrector: re-evaluate the model at the predicted point and refine.
        model_t = self.model_fn(x_t, t)

        if D1s is not None:
            corr_res = torch.tensordot(D1s, rhos_c[:-1], dims=([1], [0]))
        else:
            corr_res = 0

        D1_t = (model_t - model_prev_0)
        x_t = x_t_ - expand_dims(B_h, dims) * (corr_res + rhos_c[-1] * D1_t)

        return x_t, model_t

    def sample(self, x, sigmas, callback=None, disable_pbar=False):
        """Integrate from sigmas[0] down to sigmas[-1]; returns the final denoised estimate."""
        order = min(3, len(sigmas) - 2)
        model_prev_list, t_prev_list = [], []
        for i in trange(len(sigmas) - 1, disable=disable_pbar):
            vec_t = sigmas[i].expand(x.shape[0])

            with torch.no_grad():
                if i == 0:
                    # Warm-up: the first model call seeds the history.
                    model_prev_list = [self.model_fn(x, vec_t)]
                    t_prev_list = [vec_t]
                elif i < order:
                    # Ramp the order up while fewer history points exist.
                    init_order = i
                    x, model_x = self.update_fn(x, model_prev_list, t_prev_list, vec_t, init_order)
                    model_prev_list.append(model_x)
                    t_prev_list.append(vec_t)
                else:
                    x, model_x = self.update_fn(x, model_prev_list, t_prev_list, vec_t, order)
                    model_prev_list.append(model_x)
                    t_prev_list.append(vec_t)

                # Keep only the most recent `order` history entries.
                model_prev_list = model_prev_list[-order:]
                t_prev_list = t_prev_list[-order:]

            if callback is not None:
                callback({'x': x, 'i': i, 'denoised': model_prev_list[-1]})

        return model_prev_list[-1]
138
+
139
+
140
def sample_unipc(model, noise, sigmas, extra_args=None, callback=None, disable=False, variant='bh1'):
    """Convenience wrapper: construct a FlowMatchUniPC solver and run one sampling pass."""
    assert variant in ['bh1', 'bh2']
    solver = FlowMatchUniPC(model, extra_args=extra_args, variant=variant)
    return solver.sample(noise, sigmas=sigmas, callback=callback, disable_pbar=disable)
frame_pack/utils.py ADDED
@@ -0,0 +1,617 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cv2
3
+ import json
4
+ import random
5
+ import glob
6
+ import torch
7
+ import einops
8
+ import numpy as np
9
+ import datetime
10
+ import torchvision
11
+
12
+ import safetensors.torch as sf
13
+ from PIL import Image
14
+
15
+
16
def min_resize(x, m):
    """Resize ndarray image x so its shorter side equals m, keeping the aspect ratio."""
    if x.shape[0] < x.shape[1]:
        new_h, new_w = m, int(float(m) / float(x.shape[0]) * float(x.shape[1]))
    else:
        new_h, new_w = int(float(m) / float(x.shape[1]) * float(x.shape[0])), m
    # AREA for shrinking, LANCZOS4 for enlarging.
    shrinking = max(new_h, new_w) < max(x.shape[0], x.shape[1])
    interp = cv2.INTER_AREA if shrinking else cv2.INTER_LANCZOS4
    return cv2.resize(x, (new_w, new_h), interpolation=interp)
31
+
32
+
33
def d_resize(x, y):
    """Resize x to the spatial size of reference image y (HxWxC)."""
    target_h, target_w = y.shape[:2]
    # AREA for shrinking, LANCZOS4 for enlarging.
    shrinking = min(target_h, target_w) < min(x.shape[0], x.shape[1])
    interp = cv2.INTER_AREA if shrinking else cv2.INTER_LANCZOS4
    return cv2.resize(x, (target_w, target_h), interpolation=interp)
43
+
44
+
45
def resize_and_center_crop(image, target_width, target_height):
    """Scale an HxWxC array to cover the target size, then center-crop to it."""
    if image.shape[0] == target_height and image.shape[1] == target_width:
        return image

    pil = Image.fromarray(image)
    src_w, src_h = pil.size
    scale = max(target_width / src_w, target_height / src_h)
    new_w = int(round(src_w * scale))
    new_h = int(round(src_h * scale))
    resized = pil.resize((new_w, new_h), Image.LANCZOS)
    # Symmetric crop box (PIL accepts float coordinates here).
    box = (
        (new_w - target_width) / 2,
        (new_h - target_height) / 2,
        (new_w + target_width) / 2,
        (new_h + target_height) / 2,
    )
    return np.array(resized.crop(box))
61
+
62
+
63
def resize_and_center_crop_pytorch(image, target_width, target_height):
    """Torch variant of resize_and_center_crop for a (B, C, H, W) batch (bilinear)."""
    _, _, src_h, src_w = image.shape
    if (src_h, src_w) == (target_height, target_width):
        return image

    scale = max(target_width / src_w, target_height / src_h)
    new_h = int(round(src_h * scale))
    new_w = int(round(src_w * scale))

    resized = torch.nn.functional.interpolate(image, size=(new_h, new_w), mode="bilinear", align_corners=False)

    y0 = (new_h - target_height) // 2
    x0 = (new_w - target_width) // 2
    return resized[:, :, y0 : y0 + target_height, x0 : x0 + target_width]
80
+
81
+
82
def resize_without_crop(image, target_width, target_height):
    """Resize an HxWxC array directly to the target size (aspect ratio not preserved)."""
    if image.shape[0] == target_height and image.shape[1] == target_width:
        return image

    resized = Image.fromarray(image).resize((target_width, target_height), Image.LANCZOS)
    return np.array(resized)
89
+
90
+
91
def just_crop(image, w, h):
    """Center-crop image to the aspect ratio w:h without resizing."""
    if image.shape[0] == h and image.shape[1] == w:
        return image

    src_h, src_w = image.shape[:2]
    k = min(src_h / h, src_w / w)
    crop_w = int(round(w * k))
    crop_h = int(round(h * k))
    x0 = (src_w - crop_w) // 2
    y0 = (src_h - crop_h) // 2
    return image[y0 : y0 + crop_h, x0 : x0 + crop_w]
103
+
104
+
105
def write_to_json(data, file_path):
    """Atomically write data as indented JSON: write a temp file, then rename over the target."""
    tmp_path = file_path + ".tmp"
    with open(tmp_path, "wt", encoding="utf-8") as fh:
        json.dump(data, fh, indent=4)
    os.replace(tmp_path, file_path)
111
+
112
+
113
def read_from_json(file_path):
    """Load and return the JSON document stored at file_path."""
    with open(file_path, "rt", encoding="utf-8") as fh:
        return json.load(fh)
117
+
118
+
119
def get_active_parameters(m):
    """Return {name: param} for every trainable (requires_grad) parameter of m."""
    active = {}
    for name, param in m.named_parameters():
        if param.requires_grad:
            active[name] = param
    return active
121
+
122
+
123
def cast_training_params(m, dtype=torch.float32):
    """Cast all trainable parameters of m to dtype in place; return them keyed by name."""
    casted = {}
    for name, param in m.named_parameters():
        if not param.requires_grad:
            continue
        param.data = param.to(dtype)
        casted[name] = param
    return casted
130
+
131
+
132
def separate_lora_AB(parameters, B_patterns=None):
    """Split a parameter dict into (others, B-side) by substring patterns in the key."""
    if B_patterns is None:
        B_patterns = [".lora_B.", "__zero__"]

    parameters_normal = {}
    parameters_B = {}
    for key, value in parameters.items():
        target = parameters_B if any(pat in key for pat in B_patterns) else parameters_normal
        target[key] = value

    return parameters_normal, parameters_B
146
+
147
+
148
def set_attr_recursive(obj, attr, value):
    """Set a dotted attribute path, e.g. set_attr_recursive(m, "a.b.c", v)."""
    *parents, leaf = attr.split(".")
    target = obj
    for name in parents:
        target = getattr(target, name)
    setattr(target, leaf, value)
154
+
155
+
156
def print_tensor_list_size(tensors):
    """Print count, total MB, and total parameter count of a tensor list or dict."""
    if isinstance(tensors, dict):
        tensors = tensors.values()

    total_bytes = sum(t.nelement() * t.element_size() for t in tensors)
    total_elements = sum(t.nelement() for t in tensors)

    print(f"Total number of tensors: {len(tensors)}")
    print(f"Total size of tensors: {total_bytes / (1024**2):.2f} MB")
    print(f"Total number of parameters: {total_elements / 1e9:.3f} billion")
174
+
175
+
176
@torch.no_grad()
def batch_mixture(a, b=None, probability_a=0.5, mask_a=None):
    """Per-sample selection between a and b: where mask_a is True take a, else b.

    When mask_a is None a random mask is drawn with P(a) = probability_a;
    when b is None zeros are used.
    """
    n = a.size(0)

    if b is None:
        b = torch.zeros_like(a)
    if mask_a is None:
        mask_a = torch.rand(n) < probability_a

    broadcast_shape = (n,) + (1,) * (a.dim() - 1)
    mask_a = mask_a.to(a.device).reshape(broadcast_shape)
    return torch.where(mask_a, a, b)
190
+
191
+
192
@torch.no_grad()
def zero_module(module):
    """Zero every parameter of `module` in place and return the module."""
    for param in module.parameters():
        param.detach().zero_()
    return module
197
+
198
+
199
@torch.no_grad()
def supress_lower_channels(m, k, alpha=0.01):
    """Scale the first k input channels of m.weight by alpha (name sic: 'suppress')."""
    scaled = m.weight.data.clone()

    assert int(scaled.shape[1]) >= k

    scaled[:, :k] *= alpha
    m.weight.data = scaled.contiguous().clone()
    return m
208
+
209
+
210
def freeze_module(m):
    """Freeze m: disable grads and run forward under torch.no_grad().

    The original forward is stashed once in `_forward_inside_frozen_module`
    so it can be recovered later.
    """
    if not hasattr(m, "_forward_inside_frozen_module"):
        m._forward_inside_frozen_module = m.forward
    m.requires_grad_(False)
    m.forward = torch.no_grad()(m.forward)
    return m
216
+
217
+
218
def get_latest_safetensors(folder_path):
    """Return the absolute real path of the most recently modified *.safetensors file.

    Raises:
        ValueError: if the folder contains no .safetensors files.
    """
    candidates = glob.glob(os.path.join(folder_path, "*.safetensors"))
    if not candidates:
        raise ValueError("No file to resume!")

    newest = max(candidates, key=os.path.getmtime)
    return os.path.abspath(os.path.realpath(newest))
227
+
228
+
229
def generate_random_prompt_from_tags(tags_str, min_length=3, max_length=32):
    """Build a prompt from a random subset of comma-separated tags."""
    tags = tags_str.split(", ")
    count = min(random.randint(min_length, max_length), len(tags))
    return ", ".join(random.sample(tags, k=count))
234
+
235
+
236
def interpolate_numbers(a, b, n, round_to_int=False, gamma=1.0):
    """Return n values from a to b spaced along a gamma-warped linear ramp."""
    ramp = np.linspace(0, 1, n) ** gamma
    values = a + (b - a) * ramp
    if round_to_int:
        values = np.round(values).astype(int)
    return values.tolist()
241
+
242
+
243
def uniform_random_by_intervals(inclusive, exclusive, n, round_to_int=False):
    """Stratified sampling: one uniform draw inside each of n equal sub-intervals."""
    edges = np.linspace(0, 1, n + 1)
    draws = np.random.uniform(edges[:-1], edges[1:])
    values = inclusive + (exclusive - inclusive) * draws
    if round_to_int:
        values = np.round(values).astype(int)
    return values.tolist()
250
+
251
+
252
def soft_append_bcthw(history, current, overlap=0):
    """Concatenate two (B, C, T, H, W) clips on the time axis, cross-fading `overlap` frames."""
    if overlap <= 0:
        return torch.cat([history, current], dim=2)

    assert history.shape[2] >= overlap, f"History length ({history.shape[2]}) must be >= overlap ({overlap})"
    assert current.shape[2] >= overlap, f"Current length ({current.shape[2]}) must be >= overlap ({overlap})"

    # Linear fade: weight 1 -> 0 on history's tail, 0 -> 1 on current's head.
    fade = torch.linspace(1, 0, overlap, dtype=history.dtype, device=history.device).view(1, 1, -1, 1, 1)
    blended = fade * history[:, :, -overlap:] + (1 - fade) * current[:, :, :overlap]
    merged = torch.cat([history[:, :, :-overlap], blended, current[:, :, overlap:]], dim=2)

    return merged.to(history)
264
+
265
+
266
def save_bcthw_as_mp4(x, output_filename, fps=10):
    """Tile a (B, C, T, H, W) tensor in [-1, 1] into a grid video and write it as mp4.

    Also dumps the converted uint8 grid tensor next to the video as a .pt file.
    Returns the rearranged uint8 tensor of shape (T, rows*H, cols*W, C).
    """
    b, c, t, h, w = x.shape

    # Grid width: largest divisor of the batch from 6 down to 2, else one row of b.
    per_row = b
    for p in [6, 5, 4, 3, 2]:
        if b % p == 0:
            per_row = p
            break

    os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
    # Map [-1, 1] floats to [0, 255] uint8.
    x = torch.clamp(x.float(), -1.0, 1.0) * 127.5 + 127.5
    x = x.detach().cpu().to(torch.uint8)
    x = einops.rearrange(x, "(m n) c t h w -> t (m h) (n w) c", n=per_row)
    # crf "0" requests lossless H.264 output.
    torchvision.io.write_video(output_filename, x, fps=fps, video_codec="libx264", options={"crf": "0"})

    # write tensor as .pt file
    torch.save(x, output_filename.replace(".mp4", ".pt"))

    return x
285
+
286
+
287
def save_bcthw_as_png(x, output_filename):
    """Flatten a BCTHW tensor in [-1, 1] into one PNG (batch stacked vertically, time horizontally)."""
    os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
    img = torch.clamp(x.float(), -1.0, 1.0) * 127.5 + 127.5
    img = img.detach().cpu().to(torch.uint8)
    img = einops.rearrange(img, "b c t h w -> c (b h) (t w)")
    torchvision.io.write_png(img, output_filename)
    return output_filename
294
+
295
+
296
def save_bchw_as_png(x, output_filename):
    """Flatten a BCHW tensor in [-1, 1] into one PNG with the batch stacked horizontally."""
    os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
    img = torch.clamp(x.float(), -1.0, 1.0) * 127.5 + 127.5
    img = img.detach().cpu().to(torch.uint8)
    img = einops.rearrange(img, "b c h w -> c h (b w)")
    torchvision.io.write_png(img, output_filename)
    return output_filename
303
+
304
+
305
def add_tensors_with_padding(tensor1, tensor2):
    """Add two tensors of equal rank, zero-padding each up to their elementwise max shape.

    Fix over the original: the padded buffers are allocated with tensor1's
    dtype and device instead of `torch.zeros` defaults (float32 on CPU),
    which silently down-cast float64 inputs and failed for non-CPU tensors.
    """
    if tensor1.shape == tensor2.shape:
        return tensor1 + tensor2

    target_shape = tuple(max(s1, s2) for s1, s2 in zip(tensor1.shape, tensor2.shape))

    padded1 = torch.zeros(target_shape, dtype=tensor1.dtype, device=tensor1.device)
    padded2 = torch.zeros(target_shape, dtype=tensor1.dtype, device=tensor1.device)

    padded1[tuple(slice(0, s) for s in tensor1.shape)] = tensor1
    padded2[tuple(slice(0, s) for s in tensor2.shape)] = tensor2

    return padded1 + padded2
322
+
323
+
324
def print_free_mem():
    """Print free/total memory of CUDA device 0 in MB (after emptying the cache)."""
    torch.cuda.empty_cache()
    free_bytes, total_bytes = torch.cuda.mem_get_info(0)
    print(f"Free memory: {free_bytes / (1024**2):.2f} MB")
    print(f"Total memory: {total_bytes / (1024**2):.2f} MB")
332
+
333
+
334
def print_gpu_parameters(device, state_dict, log_count=1):
    """Print a short summary of a state dict: device, key count, and first values of a few params."""
    sampled = {}
    for key, tensor in list(state_dict.items())[:log_count]:
        sampled[key] = tensor.flatten()[:3].tolist()

    summary = {"device": device, "keys_count": len(state_dict), "params": sampled}
    print(str(summary))
347
+
348
+
349
def visualize_txt_as_img(width, height, text, font_path="font/DejaVuSans.ttf", size=18):
    """Render `text` word-wrapped onto a white width x height RGB canvas; returns an ndarray."""
    from PIL import Image, ImageDraw, ImageFont

    txt = Image.new("RGB", (width, height), color="white")
    draw = ImageDraw.Draw(txt)
    font = ImageFont.truetype(font_path, size=size)

    if text == "":
        return np.array(txt)

    # Split text into lines that fit within the image width
    lines = []
    words = text.split()
    current_line = words[0]

    for word in words[1:]:
        line_with_word = f"{current_line} {word}"
        # textbbox(...)[2] is the right edge of the rendered line in pixels.
        if draw.textbbox((0, 0), line_with_word, font=font)[2] <= width:
            current_line = line_with_word
        else:
            lines.append(current_line)
            current_line = word

    lines.append(current_line)

    # Draw the text line by line
    y = 0
    # Line height approximated from the bounding box of a single capital letter.
    line_height = draw.textbbox((0, 0), "A", font=font)[3]

    for line in lines:
        if y + line_height > height:
            break  # stop drawing if the next line will be outside the image
        draw.text((0, y), line, fill="black", font=font)
        y += line_height

    return np.array(txt)
385
+
386
+
387
def blue_mark(x):
    """Sharpen the blue channel of a [-1, 1] HxWxC image via unsharp masking; returns a copy."""
    marked = x.copy()
    blue = marked[:, :, 2]
    blurred = cv2.blur(blue, (9, 9))
    marked[:, :, 2] = ((blue - blurred) * 16.0 + blurred).clip(-1, 1)
    return marked
393
+
394
+
395
def green_mark(x):
    """Force the red and blue channels to -1, leaving only green content; returns a copy."""
    marked = x.copy()
    marked[:, :, 0] = -1
    marked[:, :, 2] = -1
    return marked
400
+
401
+
402
def frame_mark(x):
    """Draw letterbox bars (top/bottom 64 rows -> -1) and side bars (8 cols -> 1); returns a copy."""
    marked = x.copy()
    marked[:64] = -1
    marked[-64:] = -1
    marked[:, :8] = 1
    marked[:, -8:] = 1
    return marked
409
+
410
+
411
@torch.inference_mode()
def pytorch2numpy(imgs):
    """Convert an iterable of CHW tensors in [-1, 1] to HWC uint8 numpy arrays."""
    converted = []
    for img in imgs:
        hwc = img.movedim(0, -1) * 127.5 + 127.5
        converted.append(hwc.detach().float().cpu().numpy().clip(0, 255).astype(np.uint8))
    return converted
420
+
421
+
422
@torch.inference_mode()
def numpy2pytorch(imgs):
    """Stack HWC uint8 arrays into a BCHW float tensor scaled to [-1, 1]."""
    batch = np.stack(imgs, axis=0)
    return (torch.from_numpy(batch).float() / 127.5 - 1.0).movedim(-1, 1)
427
+
428
+
429
@torch.no_grad()
def duplicate_prefix_to_suffix(x, count, zero_out=False):
    """Append the first `count` items of x (or zeros of that shape if zero_out) to its end."""
    suffix = torch.zeros_like(x[:count]) if zero_out else x[:count]
    return torch.cat([x, suffix], dim=0)
435
+
436
+
437
def weighted_mse(a, b, weight):
    """Mean of weight * (a - b)^2, computed in float32."""
    diff = a.float() - b.float()
    return (weight.float() * diff.square()).mean()
439
+
440
+
441
def clamped_linear_interpolation(x, x_min, y_min, x_max, y_max, sigma=1.0):
    """Map x from [x_min, x_max] to [y_min, y_max], clamped, with gamma exponent `sigma`."""
    frac = (x - x_min) / (x_max - x_min)
    frac = min(1.0, max(0.0, frac)) ** sigma
    return y_min + frac * (y_max - y_min)
446
+
447
+
448
def expand_to_dims(x, target_dims):
    """View x with trailing singleton axes appended until it has target_dims dims (no-op if already larger)."""
    extra = max(0, target_dims - x.dim())
    return x.view(*x.shape, *((1,) * extra))
450
+
451
+
452
def repeat_to_batch_size(tensor: torch.Tensor, batch_size: int):
    """Tile `tensor` along dim 0 so its first dim equals batch_size (must divide evenly).

    None passes through unchanged; a tensor already at batch_size is returned as-is.
    """
    if tensor is None:
        return None

    first_dim = tensor.shape[0]
    if first_dim == batch_size:
        return tensor

    if batch_size % first_dim != 0:
        raise ValueError(f"Cannot evenly repeat first dim {first_dim} to match batch_size {batch_size}.")

    reps = (batch_size // first_dim,) + (1,) * (tensor.dim() - 1)
    return tensor.repeat(*reps)
467
+
468
+
469
def dim5(x):
    """Pad x with trailing singleton axes up to 5 dims (see expand_to_dims)."""
    return expand_to_dims(x, 5)


def dim4(x):
    """Pad x with trailing singleton axes up to 4 dims (see expand_to_dims)."""
    return expand_to_dims(x, 4)


def dim3(x):
    """Pad x with trailing singleton axes up to 3 dims (see expand_to_dims)."""
    return expand_to_dims(x, 3)
479
+
480
+
481
def crop_or_pad_yield_mask(x, length):
    """Crop or zero-pad (B, F, C) along F to exactly `length`; also return a bool validity mask."""
    B, F, C = x.shape
    device = x.device

    if F >= length:
        return x[:, :length, :], torch.ones((B, length), dtype=torch.bool, device=device)

    padded = torch.zeros((B, length, C), dtype=x.dtype, device=device)
    padded[:, :F, :] = x
    mask = torch.zeros((B, length), dtype=torch.bool, device=device)
    mask[:, :F] = True
    return padded, mask
494
+
495
+
496
def extend_dim(x, dim, minimal_length, zero_pad=False):
    """Grow x along `dim` to at least minimal_length, padding with zeros or the last slice."""
    current = int(x.shape[dim])
    if current >= minimal_length:
        return x

    deficit = minimal_length - current
    if zero_pad:
        pad_shape = list(x.shape)
        pad_shape[dim] = deficit
        filler = torch.zeros(pad_shape, dtype=x.dtype, device=x.device)
    else:
        # Repeat the final slice along `dim` to fill the deficit.
        selector = (slice(None),) * dim + (slice(-1, None),)
        filler = x[selector].repeat_interleave(deficit, dim=dim)

    return torch.cat([x, filler], dim=dim)
512
+
513
+
514
def lazy_positional_encoding(t, repeats=None):
    """Sinusoidal 256-dim embedding of timestep value(s) t, optionally expanded along a repeat axis."""
    from diffusers.models.embeddings import get_timestep_embedding

    values = t if isinstance(t, list) else [t]
    emb = get_timestep_embedding(
        timesteps=torch.tensor(values), embedding_dim=256, flip_sin_to_cos=True, downscale_freq_shift=0.0, scale=1.0
    )

    if repeats is None:
        return emb
    return emb[:, None, :].expand(-1, repeats, -1)
529
+
530
+
531
def state_dict_offset_merge(A, B, C=None):
    """Per-key A + B (or A + B - C), with B/C cast to A's dtype/device first."""
    merged = {}
    for key in A.keys():
        base = A[key]
        delta = B[key].to(base)
        if C is None:
            merged[key] = base + delta
        else:
            merged[key] = base + delta - C[key].to(base)
    return merged
546
+
547
+
548
def state_dict_weighted_merge(state_dicts, weights):
    """Weighted average of several state dicts; weights are normalized to sum to 1.

    Raises:
        ValueError: on length mismatch or when the weights sum to zero.
    """
    if len(state_dicts) != len(weights):
        raise ValueError("Number of state dictionaries must match number of weights")
    if not state_dicts:
        return {}

    total_weight = sum(weights)
    if total_weight == 0:
        raise ValueError("Sum of weights cannot be zero")
    normalized = [w / total_weight for w in weights]

    merged = {}
    for key in state_dicts[0].keys():
        acc = state_dicts[0][key] * normalized[0]
        for sd, w in zip(state_dicts[1:], normalized[1:]):
            acc = acc + sd[key].to(acc) * w
        merged[key] = acc
    return merged
573
+
574
+
575
def group_files_by_folder(all_files):
    """Group file paths by their immediate parent folder name; return the groups as a list of lists."""
    grouped = {}
    for path in all_files:
        parent = os.path.basename(os.path.dirname(path))
        grouped.setdefault(parent, []).append(path)
    return list(grouped.values())
586
+
587
+
588
def generate_timestamp():
    """Build a unique-ish id of the form yymmdd_HHMMSS_mmm_<random 0..9999>."""
    now = datetime.datetime.now()
    millis = int(now.microsecond / 1000)
    suffix = random.randint(0, 9999)
    return f"{now.strftime('%y%m%d_%H%M%S')}_{millis:03d}_{suffix}"
594
+
595
+
596
def write_PIL_image_with_png_info(image, metadata, path):
    """Save a PIL image as PNG with each metadata item embedded as a text chunk; returns the image."""
    from PIL.PngImagePlugin import PngInfo

    info = PngInfo()
    for key, value in metadata.items():
        info.add_text(key, value)

    image.save(path, "PNG", pnginfo=info)
    return image
605
+
606
+
607
def torch_safe_save(content, path):
    """Atomic torch.save: write to a sibling temp file, then rename over the target path."""
    tmp_path = path + "_tmp"
    torch.save(content, tmp_path)
    os.replace(tmp_path, path)
    return path
611
+
612
+
613
def move_optimizer_to_device(optimizer, device):
    """Move every tensor in the optimizer's state (e.g. momentum buffers) to `device` in place."""
    for state in optimizer.state.values():
        for key, value in state.items():
            if isinstance(value, torch.Tensor):
                state[key] = value.to(device)
frame_pack/wrapper.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+
4
def append_dims(x, target_dims):
    """Append trailing singleton dims to x until it has target_dims dims."""
    missing = target_dims - x.ndim
    return x[(Ellipsis,) + (None,) * missing]
6
+
7
+
8
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=1.0):
    """Rescale CFG output toward the text prediction's per-sample std; 0 disables, 1 is full rescale."""
    if guidance_rescale == 0:
        return noise_cfg

    reduce_dims = list(range(1, noise_pred_text.ndim))
    std_ratio = noise_pred_text.std(dim=reduce_dims, keepdim=True) / noise_cfg.std(dim=reduce_dims, keepdim=True)
    rescaled = noise_cfg * std_ratio
    return guidance_rescale * rescaled + (1.0 - guidance_rescale) * noise_cfg
17
+
18
+
19
def fm_wrapper(transformer, t_scale=1000.0):
    """Wrap `transformer` into a k-diffusion-style denoiser that returns x0 predictions.

    The returned callable applies classifier-free guidance (and optional
    guidance rescaling) and converts the flow prediction to an x0 estimate.
    """

    def k_model(x, sigma, **extra_args):
        work_dtype = extra_args['dtype']
        cfg_scale = extra_args['cfg_scale']
        cfg_rescale = extra_args['cfg_rescale']
        concat_latent = extra_args['concat_latent']

        input_dtype = x.dtype
        sigma = sigma.float()

        x = x.to(work_dtype)
        timestep = (sigma * t_scale).to(work_dtype)

        if concat_latent is None:
            hidden_states = x
        else:
            hidden_states = torch.cat([x, concat_latent.to(x)], dim=1)

        pred_positive = transformer(
            hidden_states=hidden_states, timestep=timestep, return_dict=False, **extra_args['positive']
        )[0].float()

        if cfg_scale == 1.0:
            # CFG disabled: the negative branch cancels out, so skip the second pass.
            pred_negative = torch.zeros_like(pred_positive)
        else:
            pred_negative = transformer(
                hidden_states=hidden_states, timestep=timestep, return_dict=False, **extra_args['negative']
            )[0].float()

        pred_cfg = pred_negative + cfg_scale * (pred_positive - pred_negative)
        pred = rescale_noise_cfg(pred_cfg, pred_positive, guidance_rescale=cfg_rescale)

        # Flow matching: x0 = x - sigma * v.
        x0 = x.float() - pred.float() * append_dims(sigma, x.ndim)

        return x0.to(dtype=input_dtype)

    return k_model
framepack_edit_output/framepack-edit-lora-000001.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5a6478224e15dd49359bb791f4d1984d4f87b2b69e858784a32266d4a9b270c
3
+ size 275426304
framepack_edit_output/framepack-edit-lora-000002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b24eefda91054ca54f70c9d50eb2df47a1954c4ddf2f3f12078d67e8a97a767
3
+ size 275426304
framepack_edit_output/framepack-edit-lora-000003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9c0e9747f651655dd95dd13f1c2999662a48dc3e89c84537ec7dc88ec1b307f
3
+ size 275426304
framepack_edit_output/framepack-edit-lora-000004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7456a21c9cbf4bcf4ddcf2e8aacff7d90dc96c1dbaa1f802bb32dbd9e38bbb9b
3
+ size 275426304
framepack_edit_output/framepack-edit-lora-000005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1d152090aafda957a8ab146ab183aed9b2ddeed70e9fc003163d59024f7e3d6
3
+ size 275426304
framepack_edit_output/framepack-edit-lora-000006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30bed80789e6ea6d3b5749e86b299889a9d3282758862a5cacf69c66f49c89a5
3
+ size 275426304
hunyuan_model/__init__.py ADDED
File without changes
hunyuan_model/activation_layers.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+
3
+
4
def get_activation_layer(act_type):
    """Look up an activation-layer factory by name.

    Args:
        act_type (str): one of "gelu", "gelu_tanh", "relu", "silu".

    Returns:
        A zero-argument callable that constructs the corresponding ``nn.Module``.

    Raises:
        ValueError: if ``act_type`` is not a known activation name.
    """
    factories = {
        "gelu": lambda: nn.GELU(),
        # Approximate `tanh` requires torch >= 1.13
        "gelu_tanh": lambda: nn.GELU(approximate="tanh"),
        "relu": nn.ReLU,
        "silu": nn.SiLU,
    }
    if act_type not in factories:
        raise ValueError(f"Unknown activation type: {act_type}")
    return factories[act_type]
hunyuan_model/attention.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib.metadata
2
+ import math
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+ try:
9
+ import flash_attn
10
+ from flash_attn.flash_attn_interface import _flash_attn_forward
11
+ from flash_attn.flash_attn_interface import flash_attn_varlen_func
12
+ from flash_attn.flash_attn_interface import flash_attn_func
13
+ except ImportError:
14
+ flash_attn = None
15
+ flash_attn_varlen_func = None
16
+ _flash_attn_forward = None
17
+ flash_attn_func = None
18
+
19
+ try:
20
+ print(f"Trying to import sageattention")
21
+ from sageattention import sageattn_varlen, sageattn
22
+
23
+ print("Successfully imported sageattention")
24
+ except ImportError:
25
+ print(f"Failed to import sageattention")
26
+ sageattn_varlen = None
27
+ sageattn = None
28
+
29
+ try:
30
+ import xformers.ops as xops
31
+ except ImportError:
32
+ xops = None
33
+
34
# Per-backend (pre_layout, post_layout) transforms applied to q/k/v before the
# attention call and to its output afterwards:
# - "flash"/"sageattn" (varlen): flatten batch and sequence into one dimension.
# - "torch"/"vanilla"/"sageattn_fixlen": swap dims 1 and 2
#   ([b, s, a, d] <-> [b, a, s, d]).
# - "flash_fixlen"/"xformers": identity (inputs stay [b, s, a, d]).
MEMORY_LAYOUT = {
    "flash": (
        lambda x: x.view(x.shape[0] * x.shape[1], *x.shape[2:]),
        lambda x: x,
    ),
    "flash_fixlen": (
        lambda x: x,
        lambda x: x,
    ),
    "sageattn": (
        lambda x: x.view(x.shape[0] * x.shape[1], *x.shape[2:]),
        lambda x: x,
    ),
    "sageattn_fixlen": (
        lambda x: x.transpose(1, 2),
        lambda x: x.transpose(1, 2),
    ),
    "torch": (
        lambda x: x.transpose(1, 2),
        lambda x: x.transpose(1, 2),
    ),
    "xformers": (
        lambda x: x,
        lambda x: x,
    ),
    "vanilla": (
        lambda x: x.transpose(1, 2),
        lambda x: x.transpose(1, 2),
    ),
}
64
+
65
+
66
def get_cu_seqlens(text_mask, img_len):
    """Calculate cu_seqlens_q, cu_seqlens_kv using text_mask and img_len

    Args:
        text_mask (torch.Tensor): the mask of text, shape (batch, text_seq_len)
        img_len (int): the length of image

    Returns:
        torch.Tensor: int32 cumulative sequence lengths for flash attention,
        shape (2 * batch + 1,), on the same device as ``text_mask``.
    """
    batch_size = text_mask.shape[0]
    text_len = text_mask.sum(dim=1)
    max_len = text_mask.shape[1] + img_len

    # Allocate on the mask's device. The previous hard-coded device="cuda"
    # broke CPU-only runs and multi-device setups.
    cu_seqlens = torch.zeros([2 * batch_size + 1], dtype=torch.int32, device=text_mask.device)

    for i in range(batch_size):
        s = text_len[i] + img_len  # number of valid (image + text) tokens in sample i
        s1 = i * max_len + s       # end offset of the valid segment
        s2 = (i + 1) * max_len     # end offset of the padded segment
        cu_seqlens[2 * i + 1] = s1
        cu_seqlens[2 * i + 2] = s2

    return cu_seqlens
90
+
91
+
92
def attention(
    q_or_qkv_list,
    k=None,
    v=None,
    mode="flash",
    drop_rate=0,
    attn_mask=None,
    total_len=None,
    causal=False,
    cu_seqlens_q=None,
    cu_seqlens_kv=None,
    max_seqlen_q=None,
    max_seqlen_kv=None,
    batch_size=1,
):
    """
    Perform QKV self attention with a selectable backend.

    Args:
        q_or_qkv_list: Query tensor with shape [b, s, a, d] (a = number of heads),
            or a list [q, k, v]; a list is cleared in place so the caller's
            references are released as early as possible.
        k (torch.Tensor): Key tensor with shape [b, s1, a, d]
        v (torch.Tensor): Value tensor with shape [b, s1, a, d]
        mode (str): Attention backend: 'flash', 'torch', 'vanilla', 'sageattn',
            'xformers' (the '*_fixlen' variants are selected automatically when
            total_len is given).
        drop_rate (float): Dropout rate in attention map. (default: 0)
        attn_mask (torch.Tensor): Attention mask with shape [b, s1] (cross_attn), or [b, a, s, s1] (torch or vanilla).
            (default: None)
        total_len: per-sample valid lengths; when given, padded tails are trimmed
            before attention and zero-padded back afterwards ("split" path).
        causal (bool): Whether to use causal attention. (default: False)
        cu_seqlens_q (torch.Tensor): dtype torch.int32. The cumulative sequence lengths of the sequences in the batch,
            used to index into q (varlen backends).
        cu_seqlens_kv (torch.Tensor): dtype torch.int32. The cumulative sequence lengths of the sequences in the batch,
            used to index into kv (varlen backends).
        max_seqlen_q (int): The maximum sequence length in the batch of q.
        max_seqlen_kv (int): The maximum sequence length in the batch of k and v.
        batch_size (int): batch size used to un-flatten varlen backend outputs.

    Returns:
        torch.Tensor: Output tensor after self attention with shape [b, s, a*d]
    """
    q, k, v = q_or_qkv_list if type(q_or_qkv_list) == list else (q_or_qkv_list, k, v)
    if type(q_or_qkv_list) == list:
        q_or_qkv_list.clear()
    split_attn = total_len is not None
    # Varlen backends switch to their per-sample fixed-length variants when
    # sequences are trimmed individually.
    if split_attn and mode == "sageattn":
        mode = "sageattn_fixlen"
    elif split_attn and mode == "flash":
        mode = "flash_fixlen"
    # print(f"Attention mode: {mode}, split_attn: {split_attn}")
    pre_attn_layout, post_attn_layout = MEMORY_LAYOUT[mode]

    # trim the sequence length to the actual length instead of attn_mask
    if split_attn:
        trimmed_len = q.shape[1] - total_len
        q = [q[i : i + 1, : total_len[i]] for i in range(len(q))]
        k = [k[i : i + 1, : total_len[i]] for i in range(len(k))]
        v = [v[i : i + 1, : total_len[i]] for i in range(len(v))]
        q = [pre_attn_layout(q_i) for q_i in q]
        k = [pre_attn_layout(k_i) for k_i in k]
        v = [pre_attn_layout(v_i) for v_i in v]
        # print(
        #     f"Trimming the sequence length to {total_len},trimmed_len: {trimmed_len}, q.shape: {[q_i.shape for q_i in q]}, mode: {mode}"
        # )
    else:
        q = pre_attn_layout(q)
        k = pre_attn_layout(k)
        v = pre_attn_layout(v)

    if mode == "torch":
        if split_attn:
            x = []
            for i in range(len(q)):
                x_i = F.scaled_dot_product_attention(q[i], k[i], v[i], dropout_p=drop_rate, is_causal=causal)
                q[i], k[i], v[i] = None, None, None
                x.append(x_i)
            del q, k, v
        else:
            # SDPA requires a floating mask in q's dtype (or bool).
            if attn_mask is not None and attn_mask.dtype != torch.bool:
                attn_mask = attn_mask.to(q.dtype)
            x = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=drop_rate, is_causal=causal)
            del q, k, v
        del attn_mask

    elif mode == "xformers":
        # B, M, H, K: M is the sequence length, H is the number of heads, K is the dimension of the heads -> it is same as input dimension
        # currently only support batch_size = 1
        assert split_attn, "Xformers only supports splitting"
        x = []
        for i in range(len(q)):
            x_i = xops.memory_efficient_attention(q[i], k[i], v[i], p=drop_rate)  # , causal=causal)
            q[i], k[i], v[i] = None, None, None
            x.append(x_i)
        del q, k, v

    elif mode == "flash":
        x = flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
        del q, k, v
        # x with shape [(bxs), a, d]
        x = x.view(batch_size, max_seqlen_q, x.shape[-2], x.shape[-1])  # reshape x to [b, s, a, d]
    elif mode == "flash_fixlen":
        x = []
        for i in range(len(q)):
            # q: (batch_size, seqlen, nheads, headdim), k: (batch_size, seqlen, nheads_k, headdim), v: (batch_size, seqlen, nheads_k, headdim)
            x_i = flash_attn_func(q[i], k[i], v[i], dropout_p=drop_rate, causal=causal)
            q[i], k[i], v[i] = None, None, None
            x.append(x_i)
        del q, k, v
    elif mode == "sageattn":
        x = sageattn_varlen(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
        del q, k, v
        # x with shape [(bxs), a, d]
        x = x.view(batch_size, max_seqlen_q, x.shape[-2], x.shape[-1])  # reshape x to [b, s, a, d]
    elif mode == "sageattn_fixlen":
        x = []
        for i in range(len(q)):
            # HND seems to cause an error
            x_i = sageattn(q[i], k[i], v[i])  # (batch_size, seq_len, head_num, head_dim)
            q[i], k[i], v[i] = None, None, None
            x.append(x_i)
        del q, k, v
    elif mode == "vanilla":
        assert not split_attn, "Vanilla attention does not support trimming"
        scale_factor = 1 / math.sqrt(q.size(-1))

        b, a, s, _ = q.shape
        s1 = k.size(2)
        attn_bias = torch.zeros(b, a, s, s1, dtype=q.dtype, device=q.device)
        if causal:
            # Only applied to self attention
            assert attn_mask is None, "Causal mask and attn_mask cannot be used together"
            temp_mask = torch.ones(b, a, s, s, dtype=torch.bool, device=q.device).tril(diagonal=0)
            attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
            # NOTE(review): `.to` is not in-place, so this line has no effect —
            # attn_bias is already in q's dtype; confirm before removing.
            attn_bias.to(q.dtype)

        if attn_mask is not None:
            if attn_mask.dtype == torch.bool:
                attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
            else:
                attn_bias += attn_mask

        # TODO: Maybe force q and k to be float32 to avoid numerical overflow
        attn = (q @ k.transpose(-2, -1)) * scale_factor
        attn += attn_bias
        attn = attn.softmax(dim=-1)
        attn = torch.dropout(attn, p=drop_rate, train=True)
        x = attn @ v
    else:
        raise NotImplementedError(f"Unsupported attention mode: {mode}")

    if split_attn:
        x = [post_attn_layout(x_i) for x_i in x]
        for i in range(len(x)):
            # Zero-pad each sample back to the original padded sequence length.
            x[i] = F.pad(x[i], (0, 0, 0, 0, 0, trimmed_len[i]))
        x = torch.cat(x, dim=0)
    else:
        x = post_attn_layout(x)

    b, s, a, d = x.shape
    out = x.reshape(b, s, -1)
    return out
249
+
250
+
251
def parallel_attention(hybrid_seq_parallel_attn, q, k, v, img_q_len, img_kv_len, cu_seqlens_q, cu_seqlens_kv):
    """Sequence-parallel attention over image + first text segment, followed by
    plain flash attention over the remaining text tokens; the two results are
    concatenated along the sequence dimension and flattened to [b, s, a*d].

    Args:
        hybrid_seq_parallel_attn: callable implementing the hybrid
            sequence-parallel attention (joint query/key/value "rear" strategy).
        q, k, v (torch.Tensor): [b, s, a, d] projections.
        img_q_len, img_kv_len (int): number of image tokens in q and k/v.
        cu_seqlens_q, cu_seqlens_kv: cumulative sequence lengths; index [1]
            marks the end of the first (image + text) segment.
    """
    attn1 = hybrid_seq_parallel_attn(
        None,
        q[:, :img_q_len, :, :],
        k[:, :img_kv_len, :, :],
        v[:, :img_kv_len, :, :],
        dropout_p=0.0,
        causal=False,
        joint_tensor_query=q[:, img_q_len : cu_seqlens_q[1]],
        joint_tensor_key=k[:, img_kv_len : cu_seqlens_kv[1]],
        joint_tensor_value=v[:, img_kv_len : cu_seqlens_kv[1]],
        joint_strategy="rear",
    )
    # flash-attn >= 2.7.0 split `window_size` into window_size_left/right.
    # Compare version components numerically: the previous string comparison
    # (`__version__ >= "2.7.0"`) misorders releases such as "2.10.0".
    try:
        fa_version = tuple(int(p) for p in flash_attn.__version__.split(".")[:2])
    except (AttributeError, ValueError):
        fa_version = (0, 0)
    if fa_version >= (2, 7):
        attn2, *_ = _flash_attn_forward(
            q[:, cu_seqlens_q[1] :],
            k[:, cu_seqlens_kv[1] :],
            v[:, cu_seqlens_kv[1] :],
            dropout_p=0.0,
            softmax_scale=q.shape[-1] ** (-0.5),
            causal=False,
            window_size_left=-1,
            window_size_right=-1,
            softcap=0.0,
            alibi_slopes=None,
            return_softmax=False,
        )
    else:
        attn2, *_ = _flash_attn_forward(
            q[:, cu_seqlens_q[1] :],
            k[:, cu_seqlens_kv[1] :],
            v[:, cu_seqlens_kv[1] :],
            dropout_p=0.0,
            softmax_scale=q.shape[-1] ** (-0.5),
            causal=False,
            window_size=(-1, -1),
            softcap=0.0,
            alibi_slopes=None,
            return_softmax=False,
        )
    attn = torch.cat([attn1, attn2], dim=1)
    b, s, a, d = attn.shape
    attn = attn.reshape(b, s, -1)

    return attn
hunyuan_model/autoencoder_kl_causal_3d.py ADDED
@@ -0,0 +1,609 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+ #
16
+ # Modified from diffusers==0.29.2
17
+ #
18
+ # ==============================================================================
19
+ from typing import Dict, Optional, Tuple, Union
20
+ from dataclasses import dataclass
21
+
22
+ import torch
23
+ import torch.nn as nn
24
+
25
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
26
+
27
+ # try:
28
+ # # This diffusers is modified and packed in the mirror.
29
+ # from diffusers.loaders import FromOriginalVAEMixin
30
+ # except ImportError:
31
+ # # Use this to be compatible with the original diffusers.
32
+ # from diffusers.loaders.single_file_model import FromOriginalModelMixin as FromOriginalVAEMixin
33
+ from diffusers.utils.accelerate_utils import apply_forward_hook
34
+ from diffusers.models.attention_processor import (
35
+ ADDED_KV_ATTENTION_PROCESSORS,
36
+ CROSS_ATTENTION_PROCESSORS,
37
+ Attention,
38
+ AttentionProcessor,
39
+ AttnAddedKVProcessor,
40
+ AttnProcessor,
41
+ )
42
+ from diffusers.models.modeling_outputs import AutoencoderKLOutput
43
+ from diffusers.models.modeling_utils import ModelMixin
44
+ from .vae import DecoderCausal3D, BaseOutput, DecoderOutput, DiagonalGaussianDistribution, EncoderCausal3D
45
+
46
+
47
@dataclass
class DecoderOutput2(BaseOutput):
    """Decoder output carrying the decoded sample and, optionally, the
    encoding posterior (for encode-then-decode workflows)."""

    # Decoded pixel-space sample.
    sample: torch.FloatTensor
    # Latent posterior, populated only when the caller also encoded.
    posterior: Optional[DiagonalGaussianDistribution] = None
51
+
52
+
53
+ class AutoencoderKLCausal3D(ModelMixin, ConfigMixin):
54
+ r"""
55
+ A VAE model with KL loss for encoding images/videos into latents and decoding latent representations into images/videos.
56
+
57
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
58
+ for all models (such as downloading or saving).
59
+ """
60
+
61
+ _supports_gradient_checkpointing = True
62
+
63
    @register_to_config
    def __init__(
        self,
        in_channels: int = 3,
        out_channels: int = 3,
        down_block_types: Tuple[str] = ("DownEncoderBlockCausal3D",),
        up_block_types: Tuple[str] = ("UpDecoderBlockCausal3D",),
        block_out_channels: Tuple[int] = (64,),
        layers_per_block: int = 1,
        act_fn: str = "silu",
        latent_channels: int = 4,
        norm_num_groups: int = 32,
        sample_size: int = 32,
        sample_tsize: int = 64,
        scaling_factor: float = 0.18215,
        force_upcast: float = True,
        spatial_compression_ratio: int = 8,
        time_compression_ratio: int = 4,
        mid_block_add_attention: bool = True,
    ):
        """Build the causal-3D VAE: encoder, decoder, and 1x1x1 quant convs.

        `sample_size`/`sample_tsize` set the spatial/temporal tile sizes used
        when tiling is enabled. `scaling_factor` and `force_upcast` are stored
        on the config by @register_to_config but not read in this method.
        """
        super().__init__()

        self.time_compression_ratio = time_compression_ratio

        self.encoder = EncoderCausal3D(
            in_channels=in_channels,
            out_channels=latent_channels,
            down_block_types=down_block_types,
            block_out_channels=block_out_channels,
            layers_per_block=layers_per_block,
            act_fn=act_fn,
            norm_num_groups=norm_num_groups,
            double_z=True,
            time_compression_ratio=time_compression_ratio,
            spatial_compression_ratio=spatial_compression_ratio,
            mid_block_add_attention=mid_block_add_attention,
        )

        self.decoder = DecoderCausal3D(
            in_channels=latent_channels,
            out_channels=out_channels,
            up_block_types=up_block_types,
            block_out_channels=block_out_channels,
            layers_per_block=layers_per_block,
            norm_num_groups=norm_num_groups,
            act_fn=act_fn,
            time_compression_ratio=time_compression_ratio,
            spatial_compression_ratio=spatial_compression_ratio,
            mid_block_add_attention=mid_block_add_attention,
        )

        # 1x1x1 convs mapping encoder moments / latents around the bottleneck.
        self.quant_conv = nn.Conv3d(2 * latent_channels, 2 * latent_channels, kernel_size=1)
        self.post_quant_conv = nn.Conv3d(latent_channels, latent_channels, kernel_size=1)

        self.use_slicing = False
        self.use_spatial_tiling = False
        self.use_temporal_tiling = False

        # only relevant if vae tiling is enabled
        self.tile_sample_min_tsize = sample_tsize
        self.tile_latent_min_tsize = sample_tsize // time_compression_ratio

        self.tile_sample_min_size = self.config.sample_size
        # sample_size may be a scalar or a (H, W) pair; tile sizing uses the first entry.
        sample_size = self.config.sample_size[0] if isinstance(self.config.sample_size, (list, tuple)) else self.config.sample_size
        self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
        self.tile_overlap_factor = 0.25
129
+
130
    def _set_gradient_checkpointing(self, module, value=False):
        # Toggle gradient checkpointing on the encoder/decoder submodules only.
        if isinstance(module, (EncoderCausal3D, DecoderCausal3D)):
            module.gradient_checkpointing = value
133
+
134
    def enable_temporal_tiling(self, use_tiling: bool = True):
        """Enable (or disable) tiling along the temporal axis."""
        self.use_temporal_tiling = use_tiling
136
+
137
    def disable_temporal_tiling(self):
        """Disable tiling along the temporal axis."""
        self.enable_temporal_tiling(False)
139
+
140
    def enable_spatial_tiling(self, use_tiling: bool = True):
        """Enable (or disable) tiling along the spatial axes."""
        self.use_spatial_tiling = use_tiling
142
+
143
    def disable_spatial_tiling(self):
        """Disable tiling along the spatial axes."""
        self.enable_spatial_tiling(False)
145
+
146
    def enable_tiling(self, use_tiling: bool = True):
        r"""
        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
        processing larger videos.

        Enables both spatial and temporal tiling.
        """
        self.enable_spatial_tiling(use_tiling)
        self.enable_temporal_tiling(use_tiling)
154
+
155
    def disable_tiling(self):
        r"""
        Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
        decoding in one step.

        Disables both spatial and temporal tiling.
        """
        self.disable_spatial_tiling()
        self.disable_temporal_tiling()
162
+
163
    def enable_slicing(self):
        r"""
        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
        """
        self.use_slicing = True
169
+
170
    def disable_slicing(self):
        r"""
        Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
        decoding in one step.
        """
        self.use_slicing = False
176
+
177
    def set_chunk_size_for_causal_conv_3d(self, chunk_size: int):
        # set chunk_size to CausalConv3d recursively
        def set_chunk_size(module):
            # Duck-typed: any submodule exposing a `chunk_size` attribute is updated.
            if hasattr(module, "chunk_size"):
                module.chunk_size = chunk_size

        self.apply(set_chunk_size)
184
+
185
    @property
    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
    def attn_processors(self) -> Dict[str, AttentionProcessor]:
        r"""
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model with
            indexed by its weight name.
        """
        # set recursively
        processors = {}

        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
            # Collect any module exposing `get_processor` under its dotted path.
            if hasattr(module, "get_processor"):
                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)

            for sub_name, child in module.named_children():
                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)

            return processors

        for name, module in self.named_children():
            fn_recursive_add_processors(name, module, processors)

        return processors
209
+
210
    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False):
        r"""
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.

        """
        count = len(self.attn_processors.keys())

        # A dict must supply exactly one processor per attention layer.
        if isinstance(processor, dict) and len(processor) != count:
            raise ValueError(
                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
            )

        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
            if hasattr(module, "set_processor"):
                if not isinstance(processor, dict):
                    module.set_processor(processor, _remove_lora=_remove_lora)
                else:
                    # Dict entries are consumed (popped) as they are assigned.
                    module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)

            for sub_name, child in module.named_children():
                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)

        for name, module in self.named_children():
            fn_recursive_attn_processor(name, module, processor)
244
+
245
    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
    def set_default_attn_processor(self):
        """
        Disables custom attention processors and sets the default attention implementation.
        """
        # Only homogeneous processor families can be reset in one call.
        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
            processor = AttnAddedKVProcessor()
        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
            processor = AttnProcessor()
        else:
            raise ValueError(
                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
            )

        self.set_attn_processor(processor, _remove_lora=True)
260
+
261
    @apply_forward_hook
    def encode(
        self, x: torch.FloatTensor, return_dict: bool = True
    ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
        """
        Encode a batch of images/videos into latents.

        Args:
            x (`torch.FloatTensor`): Input batch of images/videos, shape (B, C, T, H, W).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.

        Returns:
            The latent representations of the encoded images/videos. If `return_dict` is True, a
            [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
        """
        assert len(x.shape) == 5, "The input tensor should have 5 dimensions."

        # Tiled paths take precedence: temporal tiling first, then pure
        # spatial tiling, before falling through to the direct encoder.
        if self.use_temporal_tiling and x.shape[2] > self.tile_sample_min_tsize:
            return self.temporal_tiled_encode(x, return_dict=return_dict)

        if self.use_spatial_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
            return self.spatial_tiled_encode(x, return_dict=return_dict)

        # Batch slicing trades speed for memory: encode one sample at a time.
        if self.use_slicing and x.shape[0] > 1:
            encoded_slices = [self.encoder(x_slice) for x_slice in x.split(1)]
            h = torch.cat(encoded_slices)
        else:
            h = self.encoder(x)

        moments = self.quant_conv(h)
        posterior = DiagonalGaussianDistribution(moments)

        if not return_dict:
            return (posterior,)

        return AutoencoderKLOutput(latent_dist=posterior)
298
+
299
    def _decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
        # Internal decode of a single (possibly tiled) latent batch; the
        # public `decode` wraps this with optional batch slicing.
        assert len(z.shape) == 5, "The input tensor should have 5 dimensions."

        if self.use_temporal_tiling and z.shape[2] > self.tile_latent_min_tsize:
            return self.temporal_tiled_decode(z, return_dict=return_dict)

        if self.use_spatial_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size):
            return self.spatial_tiled_decode(z, return_dict=return_dict)

        z = self.post_quant_conv(z)
        dec = self.decoder(z)

        if not return_dict:
            return (dec,)

        return DecoderOutput(sample=dec)
315
+
316
    @apply_forward_hook
    def decode(self, z: torch.FloatTensor, return_dict: bool = True, generator=None) -> Union[DecoderOutput, torch.FloatTensor]:
        """
        Decode a batch of images/videos.

        Args:
            z (`torch.FloatTensor`): Input batch of latent vectors.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
            generator: accepted for API compatibility; not used in this method.

        Returns:
            [`~models.vae.DecoderOutput`] or `tuple`:
                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
                returned.

        """
        # Batch slicing trades speed for memory: decode one sample at a time.
        if self.use_slicing and z.shape[0] > 1:
            decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
            decoded = torch.cat(decoded_slices)
        else:
            decoded = self._decode(z).sample

        if not return_dict:
            return (decoded,)

        return DecoderOutput(sample=decoded)
342
+
343
+ def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
344
+ blend_extent = min(a.shape[-2], b.shape[-2], blend_extent)
345
+ for y in range(blend_extent):
346
+ b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (y / blend_extent)
347
+ return b
348
+
349
+ def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
350
+ blend_extent = min(a.shape[-1], b.shape[-1], blend_extent)
351
+ for x in range(blend_extent):
352
+ b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (x / blend_extent)
353
+ return b
354
+
355
+ def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
356
+ blend_extent = min(a.shape[-3], b.shape[-3], blend_extent)
357
+ for x in range(blend_extent):
358
+ b[:, :, x, :, :] = a[:, :, -blend_extent + x, :, :] * (1 - x / blend_extent) + b[:, :, x, :, :] * (x / blend_extent)
359
+ return b
360
+
361
    def spatial_tiled_encode(
        self, x: torch.FloatTensor, return_dict: bool = True, return_moments: bool = False
    ) -> AutoencoderKLOutput:
        r"""Encode a batch of images/videos using a tiled encoder.

        When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
        steps. This is useful to keep memory use constant regardless of image/videos size. The end result of tiled encoding is
        different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
        tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
        output, but they should be much less noticeable.

        Args:
            x (`torch.FloatTensor`): Input batch of images/videos.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
            return_moments (`bool`, *optional*, defaults to `False`):
                When True, return the raw blended moments tensor instead of a
                posterior (used by the temporal tiling path).

        Returns:
            [`~models.autoencoder_kl.AutoencoderKLOutput`] or `tuple`:
                If return_dict is True, a [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain
                `tuple` is returned.
        """
        # Tiles overlap by tile_overlap_factor in sample space; the blend
        # extent and crop limit are expressed in latent space.
        overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
        blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
        row_limit = self.tile_latent_min_size - blend_extent

        # Split video into tiles and encode them separately.
        rows = []
        for i in range(0, x.shape[-2], overlap_size):
            row = []
            for j in range(0, x.shape[-1], overlap_size):
                tile = x[:, :, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size]
                tile = self.encoder(tile)
                tile = self.quant_conv(tile)
                row.append(tile)
            rows.append(row)
        result_rows = []
        for i, row in enumerate(rows):
            result_row = []
            for j, tile in enumerate(row):
                # blend the above tile and the left tile
                # to the current tile and add the current tile to the result row
                if i > 0:
                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
                if j > 0:
                    tile = self.blend_h(row[j - 1], tile, blend_extent)
                result_row.append(tile[:, :, :, :row_limit, :row_limit])
            result_rows.append(torch.cat(result_row, dim=-1))

        moments = torch.cat(result_rows, dim=-2)
        if return_moments:
            return moments

        posterior = DiagonalGaussianDistribution(moments)
        if not return_dict:
            return (posterior,)

        return AutoencoderKLOutput(latent_dist=posterior)
418
+
419
    def spatial_tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
        r"""
        Decode a batch of images/videos using a tiled decoder.

        Args:
            z (`torch.FloatTensor`): Input batch of latent vectors.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.

        Returns:
            [`~models.vae.DecoderOutput`] or `tuple`:
                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
                returned.
        """
        # Mirror of spatial_tiled_encode: tiles step in latent space and the
        # blend extent / crop limit are expressed in sample space.
        overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))
        blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor)
        row_limit = self.tile_sample_min_size - blend_extent

        # Split z into overlapping tiles and decode them separately.
        # The tiles have an overlap to avoid seams between tiles.
        rows = []
        for i in range(0, z.shape[-2], overlap_size):
            row = []
            for j in range(0, z.shape[-1], overlap_size):
                tile = z[:, :, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size]
                tile = self.post_quant_conv(tile)
                decoded = self.decoder(tile)
                row.append(decoded)
            rows.append(row)
        result_rows = []
        for i, row in enumerate(rows):
            result_row = []
            for j, tile in enumerate(row):
                # blend the above tile and the left tile
                # to the current tile and add the current tile to the result row
                if i > 0:
                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
                if j > 0:
                    tile = self.blend_h(row[j - 1], tile, blend_extent)
                result_row.append(tile[:, :, :, :row_limit, :row_limit])
            result_rows.append(torch.cat(result_row, dim=-1))

        dec = torch.cat(result_rows, dim=-2)
        if not return_dict:
            return (dec,)

        return DecoderOutput(sample=dec)
466
+
467
+ def temporal_tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput:
468
+
469
+ B, C, T, H, W = x.shape
470
+ overlap_size = int(self.tile_sample_min_tsize * (1 - self.tile_overlap_factor))
471
+ blend_extent = int(self.tile_latent_min_tsize * self.tile_overlap_factor)
472
+ t_limit = self.tile_latent_min_tsize - blend_extent
473
+
474
+ # Split the video into tiles and encode them separately.
475
+ row = []
476
+ for i in range(0, T, overlap_size):
477
+ tile = x[:, :, i : i + self.tile_sample_min_tsize + 1, :, :]
478
+ if self.use_spatial_tiling and (
479
+ tile.shape[-1] > self.tile_sample_min_size or tile.shape[-2] > self.tile_sample_min_size
480
+ ):
481
+ tile = self.spatial_tiled_encode(tile, return_moments=True)
482
+ else:
483
+ tile = self.encoder(tile)
484
+ tile = self.quant_conv(tile)
485
+ if i > 0:
486
+ tile = tile[:, :, 1:, :, :]
487
+ row.append(tile)
488
+ result_row = []
489
+ for i, tile in enumerate(row):
490
+ if i > 0:
491
+ tile = self.blend_t(row[i - 1], tile, blend_extent)
492
+ result_row.append(tile[:, :, :t_limit, :, :])
493
+ else:
494
+ result_row.append(tile[:, :, : t_limit + 1, :, :])
495
+
496
+ moments = torch.cat(result_row, dim=2)
497
+ posterior = DiagonalGaussianDistribution(moments)
498
+
499
+ if not return_dict:
500
+ return (posterior,)
501
+
502
+ return AutoencoderKLOutput(latent_dist=posterior)
503
+
504
+ def temporal_tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
505
+ # Split z into overlapping tiles and decode them separately.
506
+
507
+ B, C, T, H, W = z.shape
508
+ overlap_size = int(self.tile_latent_min_tsize * (1 - self.tile_overlap_factor))
509
+ blend_extent = int(self.tile_sample_min_tsize * self.tile_overlap_factor)
510
+ t_limit = self.tile_sample_min_tsize - blend_extent
511
+
512
+ row = []
513
+ for i in range(0, T, overlap_size):
514
+ tile = z[:, :, i : i + self.tile_latent_min_tsize + 1, :, :]
515
+ if self.use_spatial_tiling and (
516
+ tile.shape[-1] > self.tile_latent_min_size or tile.shape[-2] > self.tile_latent_min_size
517
+ ):
518
+ decoded = self.spatial_tiled_decode(tile, return_dict=True).sample
519
+ else:
520
+ tile = self.post_quant_conv(tile)
521
+ decoded = self.decoder(tile)
522
+ if i > 0:
523
+ decoded = decoded[:, :, 1:, :, :]
524
+ row.append(decoded)
525
+ result_row = []
526
+ for i, tile in enumerate(row):
527
+ if i > 0:
528
+ tile = self.blend_t(row[i - 1], tile, blend_extent)
529
+ result_row.append(tile[:, :, :t_limit, :, :])
530
+ else:
531
+ result_row.append(tile[:, :, : t_limit + 1, :, :])
532
+
533
+ dec = torch.cat(result_row, dim=2)
534
+ if not return_dict:
535
+ return (dec,)
536
+
537
+ return DecoderOutput(sample=dec)
538
+
539
+ def forward(
540
+ self,
541
+ sample: torch.FloatTensor,
542
+ sample_posterior: bool = False,
543
+ return_dict: bool = True,
544
+ return_posterior: bool = False,
545
+ generator: Optional[torch.Generator] = None,
546
+ ) -> Union[DecoderOutput2, torch.FloatTensor]:
547
+ r"""
548
+ Args:
549
+ sample (`torch.FloatTensor`): Input sample.
550
+ sample_posterior (`bool`, *optional*, defaults to `False`):
551
+ Whether to sample from the posterior.
552
+ return_dict (`bool`, *optional*, defaults to `True`):
553
+ Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
554
+ """
555
+ x = sample
556
+ posterior = self.encode(x).latent_dist
557
+ if sample_posterior:
558
+ z = posterior.sample(generator=generator)
559
+ else:
560
+ z = posterior.mode()
561
+ dec = self.decode(z).sample
562
+
563
+ if not return_dict:
564
+ if return_posterior:
565
+ return (dec, posterior)
566
+ else:
567
+ return (dec,)
568
+ if return_posterior:
569
+ return DecoderOutput2(sample=dec, posterior=posterior)
570
+ else:
571
+ return DecoderOutput2(sample=dec)
572
+
573
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
574
+ def fuse_qkv_projections(self):
575
+ """
576
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
577
+ key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
578
+
579
+ <Tip warning={true}>
580
+
581
+ This API is 🧪 experimental.
582
+
583
+ </Tip>
584
+ """
585
+ self.original_attn_processors = None
586
+
587
+ for _, attn_processor in self.attn_processors.items():
588
+ if "Added" in str(attn_processor.__class__.__name__):
589
+ raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
590
+
591
+ self.original_attn_processors = self.attn_processors
592
+
593
+ for module in self.modules():
594
+ if isinstance(module, Attention):
595
+ module.fuse_projections(fuse=True)
596
+
597
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
598
+ def unfuse_qkv_projections(self):
599
+ """Disables the fused QKV projection if enabled.
600
+
601
+ <Tip warning={true}>
602
+
603
+ This API is 🧪 experimental.
604
+
605
+ </Tip>
606
+
607
+ """
608
+ if self.original_attn_processors is not None:
609
+ self.set_attn_processor(self.original_attn_processors)
hunyuan_model/embed_layers.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ import math
3
+ import torch
4
+ import torch.nn as nn
5
+ from einops import rearrange, repeat
6
+
7
+ from .helpers import to_2tuple
8
+
9
class PatchEmbed(nn.Module):
    """2D Image to Patch Embedding

    Patchifies an input via a strided convolution and projects each patch to
    ``embed_dim``. Based on the impl in
    https://github.com/google-research/vision_transformer (hacked together by
    / Copyright 2020 Ross Wightman). The `_assert` in the forward pass was
    removed so multi-resolution inputs work.

    NOTE(review): ``proj`` is a Conv3d while ``patch_size`` goes through
    ``to_2tuple`` — callers presumably pass a 3-tuple (t, h, w), which
    ``to_2tuple`` passes through unchanged; confirm at call sites.
    """

    def __init__(
        self,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        norm_layer=None,
        flatten=True,
        bias=True,
        dtype=None,
        device=None,
    ):
        factory_kwargs = {"dtype": dtype, "device": device}
        super().__init__()
        patch_size = to_2tuple(patch_size)
        self.patch_size = patch_size
        self.flatten = flatten

        # Non-overlapping patches: kernel size equals stride.
        self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias, **factory_kwargs)
        # Xavier-uniform on the flattened kernel, zero bias.
        nn.init.xavier_uniform_(self.proj.weight.view(self.proj.weight.size(0), -1))
        if bias:
            nn.init.zeros_(self.proj.bias)

        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x):
        x = self.proj(x)
        if self.flatten:
            # (B, C, T, H, W) -> (B, N, C): one token per patch.
            x = x.flatten(2).transpose(1, 2)
        return self.norm(x)
53
+
54
+
55
class TextProjection(nn.Module):
    """
    Projects text embeddings. Also handles dropout for classifier-free guidance.

    Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
    """

    def __init__(self, in_channels, hidden_size, act_layer, dtype=None, device=None):
        factory_kwargs = {"dtype": dtype, "device": device}
        super().__init__()
        # Linear -> activation -> Linear, both layers sized to hidden_size.
        self.linear_1 = nn.Linear(in_features=in_channels, out_features=hidden_size, bias=True, **factory_kwargs)
        self.act_1 = act_layer()
        self.linear_2 = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=True, **factory_kwargs)

    def forward(self, caption):
        return self.linear_2(self.act_1(self.linear_1(caption)))
74
+
75
+
76
def timestep_embedding(t, dim, max_period=10000):
    """
    Create sinusoidal timestep embeddings.

    Args:
        t (torch.Tensor): a 1-D Tensor of N indices, one per batch element. These may be fractional.
        dim (int): the dimension of the output.
        max_period (int): controls the minimum frequency of the embeddings.

    Returns:
        embedding (torch.Tensor): An (N, D) Tensor of positional embeddings.

    .. ref_link: https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
    """
    half_dim = dim // 2
    # Geometric frequency ladder from 1 down to 1/max_period.
    exponents = torch.arange(start=0, end=half_dim, dtype=torch.float32) / half_dim
    frequencies = torch.exp(-math.log(max_period) * exponents).to(device=t.device)
    angles = t[:, None].float() * frequencies[None]
    emb = torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)
    if dim % 2:
        # Odd dims get one zero column so the output width is exactly `dim`.
        emb = torch.cat([emb, torch.zeros_like(emb[:, :1])], dim=-1)
    return emb
97
+
98
+
99
class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations via a sinusoidal
    frequency embedding followed by a two-layer MLP.
    """

    def __init__(
        self,
        hidden_size,
        act_layer,
        frequency_embedding_size=256,
        max_period=10000,
        out_size=None,
        dtype=None,
        device=None,
    ):
        factory_kwargs = {"dtype": dtype, "device": device}
        super().__init__()
        self.frequency_embedding_size = frequency_embedding_size
        self.max_period = max_period
        out_size = hidden_size if out_size is None else out_size

        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True, **factory_kwargs),
            act_layer(),
            nn.Linear(hidden_size, out_size, bias=True, **factory_kwargs),
        )
        # Small normal init for both linear layers (DiT convention).
        nn.init.normal_(self.mlp[0].weight, std=0.02)
        nn.init.normal_(self.mlp[2].weight, std=0.02)

    def forward(self, t):
        # Cast the sinusoidal features to the MLP's parameter dtype before projecting.
        freq_features = timestep_embedding(t, self.frequency_embedding_size, self.max_period)
        return self.mlp(freq_features.type(self.mlp[0].weight.dtype))
hunyuan_model/fp8_optimization.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #based on ComfyUI's and MinusZoneAI's fp8_linear optimization
2
+ #further borrowed from HunyuanVideoWrapper for Musubi Tuner
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ def fp8_linear_forward(cls, original_dtype, input):
7
+ weight_dtype = cls.weight.dtype
8
+ if weight_dtype in [torch.float8_e4m3fn, torch.float8_e5m2]:
9
+ if len(input.shape) == 3:
10
+ target_dtype = torch.float8_e5m2 if weight_dtype == torch.float8_e4m3fn else torch.float8_e4m3fn
11
+ inn = input.reshape(-1, input.shape[2]).to(target_dtype)
12
+ w = cls.weight.t()
13
+
14
+ scale = torch.ones((1), device=input.device, dtype=torch.float32)
15
+ bias = cls.bias.to(original_dtype) if cls.bias is not None else None
16
+
17
+ if bias is not None:
18
+ o = torch._scaled_mm(inn, w, out_dtype=original_dtype, bias=bias, scale_a=scale, scale_b=scale)
19
+ else:
20
+ o = torch._scaled_mm(inn, w, out_dtype=original_dtype, scale_a=scale, scale_b=scale)
21
+
22
+ if isinstance(o, tuple):
23
+ o = o[0]
24
+
25
+ return o.reshape((-1, input.shape[1], cls.weight.shape[0]))
26
+ else:
27
+ return cls.original_forward(input.to(original_dtype))
28
+ else:
29
+ return cls.original_forward(input)
30
+
31
def convert_fp8_linear(module, original_dtype, params_to_keep=()):
    """Patch every ``nn.Linear`` under ``module`` to route through ``fp8_linear_forward``.

    Each patched layer keeps its original forward as ``original_forward`` so the
    fp8 path can fall back to it. Marks the root with ``fp8_matmul_enabled``.

    Args:
        module: root module to patch in place.
        original_dtype: activation dtype used by the fp8 matmul fallback/output.
        params_to_keep: substrings; submodules whose qualified name contains any
            of them are left untouched. (Was a mutable default ``{}``; an empty
            tuple is behaviorally identical and safe.)
    """
    setattr(module, "fp8_matmul_enabled", True)

    # Fix: the original loop reused the name `module` for submodules, shadowing
    # the root parameter — keep root and submodule distinct.
    for name, submodule in module.named_modules():
        if any(keyword in name for keyword in params_to_keep):
            continue  # explicitly excluded from fp8 patching
        if isinstance(submodule, nn.Linear):
            setattr(submodule, "original_forward", submodule.forward)
            # Bind the submodule via a default arg to avoid late-binding closures.
            setattr(submodule, "forward", lambda input, m=submodule: fp8_linear_forward(m, original_dtype, input))
hunyuan_model/helpers.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections.abc
2
+
3
+ from itertools import repeat
4
+
5
+
6
def _ntuple(n):
    """Return a converter that coerces a value into an n-tuple.

    A non-string iterable is converted to a tuple; if it has exactly one
    element, that element is repeated n times. A scalar (or string) is
    repeated n times. Note an iterable of length != 1 is returned as-is —
    its length is not validated against n.
    """

    def parse(x):
        if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
            x = tuple(x)
            if len(x) == 1:
                x = tuple(repeat(x[0], n))
            return x
        return tuple(repeat(x, n))

    return parse


to_1tuple = _ntuple(1)
to_2tuple = _ntuple(2)
to_3tuple = _ntuple(3)
to_4tuple = _ntuple(4)
21
+
22
+
23
def as_tuple(x):
    """Coerce *x* into a tuple.

    Non-string iterables become ``tuple(x)``; ``None`` and scalar
    int/float/str become a 1-tuple; anything else raises ``ValueError``.
    """
    if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
        return tuple(x)
    if x is None or isinstance(x, (int, float, str)):
        return (x,)
    raise ValueError(f"Unknown type {type(x)}")
30
+
31
+
32
def as_list_of_2tuple(x):
    """Coerce *x* into a list of 2-tuples by pairing consecutive elements.

    A single value is duplicated into one pair; otherwise the flattened
    tuple must have even length.
    """
    x = as_tuple(x)
    if len(x) == 1:
        x = (x[0], x[0])
    assert len(x) % 2 == 0, f"Expect even length, got {len(x)}."
    return [(x[i], x[i + 1]) for i in range(0, len(x), 2)]
hunyuan_model/mlp_layers.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Modified from timm library:
2
+ # https://github.com/huggingface/pytorch-image-models/blob/648aaa41233ba83eb38faf5ba9d415d574823241/timm/layers/mlp.py#L13
3
+
4
+ from functools import partial
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+ from .modulate_layers import modulate
10
+ from .helpers import to_2tuple
11
+
12
+
13
class MLP(nn.Module):
    """MLP as used in Vision Transformer, MLP-Mixer and related networks.

    Layer order follows timm: fc1 -> act -> drop -> norm -> fc2 -> drop.
    ``bias`` and ``drop`` may each be a single value (applied to both layers)
    or a per-layer pair.
    """

    def __init__(
        self,
        in_channels,
        hidden_channels=None,
        out_features=None,
        act_layer=nn.GELU,
        norm_layer=None,
        bias=True,
        drop=0.0,
        use_conv=False,
        device=None,
        dtype=None,
    ):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        out_features = out_features or in_channels
        hidden_channels = hidden_channels or in_channels
        bias = to_2tuple(bias)
        drop_probs = to_2tuple(drop)
        # 1x1 convolutions act as per-position linears when use_conv is set.
        linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear

        self.fc1 = linear_layer(in_channels, hidden_channels, bias=bias[0], **factory_kwargs)
        self.act = act_layer()
        self.drop1 = nn.Dropout(drop_probs[0])
        self.norm = norm_layer(hidden_channels, **factory_kwargs) if norm_layer is not None else nn.Identity()
        self.fc2 = linear_layer(hidden_channels, out_features, bias=bias[1], **factory_kwargs)
        self.drop2 = nn.Dropout(drop_probs[1])

    def forward(self, x):
        x = self.drop1(self.act(self.fc1(x)))
        x = self.norm(x)
        return self.drop2(self.fc2(x))
60
+
61
+
62
+ #
63
class MLPEmbedder(nn.Module):
    """Two-layer SiLU MLP embedder.

    copied from https://github.com/black-forest-labs/flux/blob/main/src/flux/modules/layers.py
    """

    def __init__(self, in_dim: int, hidden_dim: int, device=None, dtype=None):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True, **factory_kwargs)
        self.silu = nn.SiLU()
        self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True, **factory_kwargs)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = self.silu(self.in_layer(x))
        return self.out_layer(hidden)
74
+
75
+
76
class FinalLayer(nn.Module):
    """The final layer of DiT.

    An adaLN-modulated LayerNorm followed by a zero-initialized linear
    projection from ``hidden_size`` to the per-patch pixel count.
    """

    def __init__(
        self, hidden_size, patch_size, out_channels, act_layer, device=None, dtype=None
    ):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()

        # Just use LayerNorm for the final layer
        self.norm_final = nn.LayerNorm(
            hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs
        )

        # Output width: one value per pixel per patch element.
        # patch_size may be a scalar (square 2D patch) or a 3-tuple (t, h, w).
        if isinstance(patch_size, int):
            out_dim = patch_size * patch_size * out_channels
        else:
            out_dim = patch_size[0] * patch_size[1] * patch_size[2] * out_channels
        # Fix: the original omitted **factory_kwargs on the tuple branch, so
        # device/dtype were silently dropped there; pass them on both paths.
        self.linear = nn.Linear(hidden_size, out_dim, bias=True, **factory_kwargs)
        # Zero-init so the block contributes nothing at the start of training.
        nn.init.zeros_(self.linear.weight)
        nn.init.zeros_(self.linear.bias)

        # Here we don't distinguish between the modulate types. Just use the simple one.
        self.adaLN_modulation = nn.Sequential(
            act_layer(),
            nn.Linear(hidden_size, 2 * hidden_size, bias=True, **factory_kwargs),
        )
        # Zero-initialize the modulation
        nn.init.zeros_(self.adaLN_modulation[1].weight)
        nn.init.zeros_(self.adaLN_modulation[1].bias)

    def forward(self, x, c):
        # Predict (shift, scale) from conditioning `c`, modulate, then project.
        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
        x = modulate(self.norm_final(x), shift=shift, scale=scale)
        return self.linear(x)
hunyuan_model/models.py ADDED
@@ -0,0 +1,1044 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Any, List, Tuple, Optional, Union, Dict
3
+ import accelerate
4
+ from einops import rearrange
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from torch.utils.checkpoint import checkpoint
9
+
10
+ from .activation_layers import get_activation_layer
11
+ from .norm_layers import get_norm_layer
12
+ from .embed_layers import TimestepEmbedder, PatchEmbed, TextProjection
13
+ from .attention import attention, parallel_attention, get_cu_seqlens
14
+ from .posemb_layers import apply_rotary_emb
15
+ from .mlp_layers import MLP, MLPEmbedder, FinalLayer
16
+ from .modulate_layers import ModulateDiT, modulate, apply_gate
17
+ from .token_refiner import SingleTokenRefiner
18
+ from modules.custom_offloading_utils import ModelOffloader, synchronize_device, clean_memory_on_device
19
+ from hunyuan_model.posemb_layers import get_nd_rotary_pos_embed
20
+
21
+ from utils.safetensors_utils import MemoryEfficientSafeOpen
22
+
23
+
24
class MMDoubleStreamBlock(nn.Module):
    """
    A multimodal DiT block with separate modulation for the text and the
    image/video stream; see (SD3) https://arxiv.org/abs/2403.03206 and
    (Flux.1) https://github.com/black-forest-labs/flux
    """

    def __init__(
        self,
        hidden_size: int,
        heads_num: int,
        mlp_width_ratio: float,
        mlp_act_type: str = "gelu_tanh",
        qk_norm: bool = True,
        qk_norm_type: str = "rms",
        qkv_bias: bool = False,
        dtype: Optional[torch.dtype] = None,
        device: Optional[torch.device] = None,
        attn_mode: str = "flash",
        split_attn: bool = False,
    ):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.attn_mode = attn_mode
        self.split_attn = split_attn

        self.deterministic = False
        self.heads_num = heads_num
        head_dim = hidden_size // heads_num
        mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
        qk_norm_layer = get_norm_layer(qk_norm_type)

        # ---- image/video stream ----
        self.img_mod = ModulateDiT(
            hidden_size,
            factor=6,
            act_layer=get_activation_layer("silu"),
            **factory_kwargs,
        )
        self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
        self.img_attn_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs)
        self.img_attn_q_norm = (
            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
        )
        self.img_attn_k_norm = (
            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
        )
        self.img_attn_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
        self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
        self.img_mlp = MLP(
            hidden_size,
            mlp_hidden_dim,
            act_layer=get_activation_layer(mlp_act_type),
            bias=True,
            **factory_kwargs,
        )

        # ---- text stream (mirrors the image stream) ----
        self.txt_mod = ModulateDiT(
            hidden_size,
            factor=6,
            act_layer=get_activation_layer("silu"),
            **factory_kwargs,
        )
        self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
        self.txt_attn_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs)
        self.txt_attn_q_norm = (
            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
        )
        self.txt_attn_k_norm = (
            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
        )
        self.txt_attn_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
        self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
        self.txt_mlp = MLP(
            hidden_size,
            mlp_hidden_dim,
            act_layer=get_activation_layer(mlp_act_type),
            bias=True,
            **factory_kwargs,
        )

        self.hybrid_seq_parallel_attn = None
        self.gradient_checkpointing = False

    def enable_deterministic(self):
        self.deterministic = True

    def disable_deterministic(self):
        self.deterministic = False

    def enable_gradient_checkpointing(self):
        self.gradient_checkpointing = True

    def disable_gradient_checkpointing(self):
        self.gradient_checkpointing = False

    def _forward(
        self,
        img: torch.Tensor,
        txt: torch.Tensor,
        vec: torch.Tensor,
        attn_mask: Optional[torch.Tensor] = None,
        total_len: Optional[torch.Tensor] = None,
        cu_seqlens_q: Optional[torch.Tensor] = None,
        cu_seqlens_kv: Optional[torch.Tensor] = None,
        max_seqlen_q: Optional[int] = None,
        max_seqlen_kv: Optional[int] = None,
        freqs_cis: tuple = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # Six modulation terms per stream: (shift, scale, gate) for attention
        # and for the MLP.
        (img_mod1_shift, img_mod1_scale, img_mod1_gate, img_mod2_shift, img_mod2_scale, img_mod2_gate) = self.img_mod(vec).chunk(
            6, dim=-1
        )
        (txt_mod1_shift, txt_mod1_scale, txt_mod1_gate, txt_mod2_shift, txt_mod2_scale, txt_mod2_gate) = self.txt_mod(vec).chunk(
            6, dim=-1
        )

        # Image stream: norm -> modulate -> QKV projection.
        # Intermediates are dropped (set to None) eagerly to keep peak memory low.
        img_modulated = modulate(self.img_norm1(img), shift=img_mod1_shift, scale=img_mod1_scale)
        img_qkv = self.img_attn_qkv(img_modulated)
        img_modulated = None
        img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
        img_qkv = None
        # Apply QK-Norm if needed.
        img_q = self.img_attn_q_norm(img_q).to(img_v)
        img_k = self.img_attn_k_norm(img_k).to(img_v)

        # RoPE applies only to the image tokens.
        if freqs_cis is not None:
            img_q_shape = img_q.shape
            img_k_shape = img_k.shape
            img_q, img_k = apply_rotary_emb(img_q, img_k, freqs_cis, head_first=False)
            assert (
                img_q.shape == img_q_shape and img_k.shape == img_k_shape
            ), f"img_kk: {img_q.shape}, img_q: {img_q_shape}, img_kk: {img_k.shape}, img_k: {img_k_shape}"

        # Text stream: same norm -> modulate -> QKV pipeline.
        txt_modulated = modulate(self.txt_norm1(txt), shift=txt_mod1_shift, scale=txt_mod1_scale)
        txt_qkv = self.txt_attn_qkv(txt_modulated)
        txt_modulated = None
        txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
        txt_qkv = None
        # Apply QK-Norm if needed.
        txt_q = self.txt_attn_q_norm(txt_q).to(txt_v)
        txt_k = self.txt_attn_k_norm(txt_k).to(txt_v)

        # Joint attention over the concatenated (image ++ text) sequence.
        img_q_len = img_q.shape[1]
        img_kv_len = img_k.shape[1]
        batch_size = img_k.shape[0]
        q = torch.cat((img_q, txt_q), dim=1)
        img_q = txt_q = None
        k = torch.cat((img_k, txt_k), dim=1)
        img_k = txt_k = None
        v = torch.cat((img_v, txt_v), dim=1)
        img_v = txt_v = None

        assert (
            cu_seqlens_q.shape[0] == 2 * img.shape[0] + 1
        ), f"cu_seqlens_q.shape:{cu_seqlens_q.shape}, img.shape[0]:{img.shape[0]}"

        # attention computation start
        if not self.hybrid_seq_parallel_attn:
            qkv_list = [q, k, v]
            q = k = v = None
            attn = attention(
                qkv_list,
                mode=self.attn_mode,
                attn_mask=attn_mask,
                total_len=total_len,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_kv=cu_seqlens_kv,
                max_seqlen_q=max_seqlen_q,
                max_seqlen_kv=max_seqlen_kv,
                batch_size=batch_size,
            )
        else:
            attn = parallel_attention(
                self.hybrid_seq_parallel_attn,
                q,
                k,
                v,
                img_q_len=img_q_len,
                img_kv_len=img_kv_len,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_kv=cu_seqlens_kv,
            )
        # attention computation end

        # Split the joint attention output back into the two streams.
        img_attn, txt_attn = attn[:, : img.shape[1]], attn[:, img.shape[1] :]
        attn = None

        # Image stream residual updates: gated attention, then gated MLP.
        img = img + apply_gate(self.img_attn_proj(img_attn), gate=img_mod1_gate)
        img_attn = None
        img = img + apply_gate(
            self.img_mlp(modulate(self.img_norm2(img), shift=img_mod2_shift, scale=img_mod2_scale)),
            gate=img_mod2_gate,
        )

        # Text stream residual updates: gated attention, then gated MLP.
        txt = txt + apply_gate(self.txt_attn_proj(txt_attn), gate=txt_mod1_gate)
        txt_attn = None
        txt = txt + apply_gate(
            self.txt_mlp(modulate(self.txt_norm2(txt), shift=txt_mod2_shift, scale=txt_mod2_scale)),
            gate=txt_mod2_gate,
        )

        return img, txt

    def forward(self, *args, **kwargs):
        # Route through torch.utils.checkpoint while training with gradient
        # checkpointing enabled; otherwise call the implementation directly.
        if self.training and self.gradient_checkpointing:
            return checkpoint(self._forward, *args, use_reentrant=False, **kwargs)
        return self._forward(*args, **kwargs)
257
+
258
+
259
+ class MMSingleStreamBlock(nn.Module):
260
+ """
261
+ A DiT block with parallel linear layers as described in
262
+ https://arxiv.org/abs/2302.05442 and adapted modulation interface.
263
+ Also refer to (SD3): https://arxiv.org/abs/2403.03206
264
+ (Flux.1): https://github.com/black-forest-labs/flux
265
+ """
266
+
267
+ def __init__(
268
+ self,
269
+ hidden_size: int,
270
+ heads_num: int,
271
+ mlp_width_ratio: float = 4.0,
272
+ mlp_act_type: str = "gelu_tanh",
273
+ qk_norm: bool = True,
274
+ qk_norm_type: str = "rms",
275
+ qk_scale: float = None,
276
+ dtype: Optional[torch.dtype] = None,
277
+ device: Optional[torch.device] = None,
278
+ attn_mode: str = "flash",
279
+ split_attn: bool = False,
280
+ ):
281
+ factory_kwargs = {"device": device, "dtype": dtype}
282
+ super().__init__()
283
+ self.attn_mode = attn_mode
284
+ self.split_attn = split_attn
285
+
286
+ self.deterministic = False
287
+ self.hidden_size = hidden_size
288
+ self.heads_num = heads_num
289
+ head_dim = hidden_size // heads_num
290
+ mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
291
+ self.mlp_hidden_dim = mlp_hidden_dim
292
+ self.scale = qk_scale or head_dim**-0.5
293
+
294
+ # qkv and mlp_in
295
+ self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + mlp_hidden_dim, **factory_kwargs)
296
+ # proj and mlp_out
297
+ self.linear2 = nn.Linear(hidden_size + mlp_hidden_dim, hidden_size, **factory_kwargs)
298
+
299
+ qk_norm_layer = get_norm_layer(qk_norm_type)
300
+ self.q_norm = qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
301
+ self.k_norm = qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
302
+
303
+ self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
304
+
305
+ self.mlp_act = get_activation_layer(mlp_act_type)()
306
+ self.modulation = ModulateDiT(hidden_size, factor=3, act_layer=get_activation_layer("silu"), **factory_kwargs)
307
+ self.hybrid_seq_parallel_attn = None
308
+
309
+ self.gradient_checkpointing = False
310
+
311
+ def enable_deterministic(self):
312
+ self.deterministic = True
313
+
314
+ def disable_deterministic(self):
315
+ self.deterministic = False
316
+
317
+ def enable_gradient_checkpointing(self):
318
+ self.gradient_checkpointing = True
319
+
320
+ def disable_gradient_checkpointing(self):
321
+ self.gradient_checkpointing = False
322
+
323
    def _forward(
        self,
        x: torch.Tensor,
        vec: torch.Tensor,
        txt_len: int,
        attn_mask: Optional[torch.Tensor] = None,
        total_len: Optional[torch.Tensor] = None,
        cu_seqlens_q: Optional[torch.Tensor] = None,
        cu_seqlens_kv: Optional[torch.Tensor] = None,
        max_seqlen_q: Optional[int] = None,
        max_seqlen_kv: Optional[int] = None,
        freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None,
    ) -> torch.Tensor:
        """Single-stream DiT block: one fused linear produces QKV + the MLP
        stream, attention and activation run in parallel, and a second linear
        merges both streams back into a gated residual update.

        Args:
            x: concatenated image+text token sequence; the text tokens occupy
                the trailing ``txt_len`` positions.
            vec: conditioning vector; projected to shift/scale/gate modulation.
            txt_len: number of trailing text tokens in ``x``.
            freqs_cis: optional (cos, sin) RoPE tables applied to the image
                tokens only.

        Returns:
            ``x`` plus the gated block output (same shape as ``x``).
        """
        # Modulated pre-norm, then a single fused projection for QKV and the MLP input.
        mod_shift, mod_scale, mod_gate = self.modulation(vec).chunk(3, dim=-1)
        x_mod = modulate(self.pre_norm(x), shift=mod_shift, scale=mod_scale)
        qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
        x_mod = None  # drop reference promptly to reduce peak memory
        # mlp = mlp.to("cpu", non_blocking=True)
        # clean_memory_on_device(x.device)

        q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
        qkv = None

        # Apply QK-Norm if needed.
        q = self.q_norm(q).to(v)
        k = self.k_norm(k).to(v)

        # Apply RoPE if needed (image tokens only; text tokens are passed through).
        if freqs_cis is not None:
            img_q, txt_q = q[:, :-txt_len, :, :], q[:, -txt_len:, :, :]
            img_k, txt_k = k[:, :-txt_len, :, :], k[:, -txt_len:, :, :]
            q = k = None
            img_q_shape = img_q.shape
            img_k_shape = img_k.shape
            img_q, img_k = apply_rotary_emb(img_q, img_k, freqs_cis, head_first=False)
            # NOTE(review): the labels in this assert message look swapped/duplicated
            # (both "img_kk"/"img_q" print img_q). The condition itself is correct.
            assert (
                img_q.shape == img_q_shape and img_k_shape == img_k.shape
            ), f"img_kk: {img_q.shape}, img_q: {img_q.shape}, img_kk: {img_k.shape}, img_k: {img_k.shape}"
            # img_q, img_k = img_qq, img_kk
            # del img_qq, img_kk
            q = torch.cat((img_q, txt_q), dim=1)
            k = torch.cat((img_k, txt_k), dim=1)
            del img_q, txt_q, img_k, txt_k

        # Compute attention. cu_seqlens holds one (img, txt) pair of offsets per sample.
        assert cu_seqlens_q.shape[0] == 2 * x.shape[0] + 1, f"cu_seqlens_q.shape:{cu_seqlens_q.shape}, x.shape[0]:{x.shape[0]}"

        # attention computation start
        if not self.hybrid_seq_parallel_attn:
            # Hand q/k/v over as a list so `attention` can free them internally.
            l = [q, k, v]
            q = k = v = None
            attn = attention(
                l,
                mode=self.attn_mode,
                attn_mask=attn_mask,
                total_len=total_len,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_kv=cu_seqlens_kv,
                max_seqlen_q=max_seqlen_q,
                max_seqlen_kv=max_seqlen_kv,
                batch_size=x.shape[0],
            )
        else:
            # NOTE(review): in this branch img_q/img_k are only defined (and are
            # deleted) inside the RoPE branch above — this path looks like it would
            # raise NameError if taken; confirm against the sequence-parallel setup.
            attn = parallel_attention(
                self.hybrid_seq_parallel_attn,
                q,
                k,
                v,
                img_q_len=img_q.shape[1],
                img_kv_len=img_k.shape[1],
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_kv=cu_seqlens_kv,
            )
        # attention computation end

        # Compute activation in mlp stream, cat again and run second linear layer.
        # mlp = mlp.to(x.device)
        mlp = self.mlp_act(mlp)
        attn_mlp = torch.cat((attn, mlp), 2)
        attn = None
        mlp = None
        output = self.linear2(attn_mlp)
        attn_mlp = None
        return x + apply_gate(output, gate=mod_gate)
407
+
408
+ # def forward(
409
+ # self,
410
+ # x: torch.Tensor,
411
+ # vec: torch.Tensor,
412
+ # txt_len: int,
413
+ # attn_mask: Optional[torch.Tensor] = None,
414
+ # cu_seqlens_q: Optional[torch.Tensor] = None,
415
+ # cu_seqlens_kv: Optional[torch.Tensor] = None,
416
+ # max_seqlen_q: Optional[int] = None,
417
+ # max_seqlen_kv: Optional[int] = None,
418
+ # freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None,
419
+ # ) -> torch.Tensor:
420
+ def forward(self, *args, **kwargs):
421
+ if self.training and self.gradient_checkpointing:
422
+ return checkpoint(self._forward, *args, use_reentrant=False, **kwargs)
423
+ else:
424
+ return self._forward(*args, **kwargs)
425
+
426
+
427
class HYVideoDiffusionTransformer(nn.Module):  # ModelMixin, ConfigMixin):
    """
    HunyuanVideo Transformer backbone

    Inherited from ModelMixin and ConfigMixin for compatibility with diffusers' sampler StableDiffusionPipeline.

    Reference:
    [1] Flux.1: https://github.com/black-forest-labs/flux
    [2] MMDiT: http://arxiv.org/abs/2403.03206

    Parameters
    ----------
    args: argparse.Namespace
        The arguments parsed by argparse.
    patch_size: list
        The size of the patch.
    in_channels: int
        The number of input channels.
    out_channels: int
        The number of output channels.
    hidden_size: int
        The hidden size of the transformer backbone.
    heads_num: int
        The number of attention heads.
    mlp_width_ratio: float
        The ratio of the hidden size of the MLP in the transformer block.
    mlp_act_type: str
        The activation function of the MLP in the transformer block.
    depth_double_blocks: int
        The number of transformer blocks in the double blocks.
    depth_single_blocks: int
        The number of transformer blocks in the single blocks.
    rope_dim_list: list
        The dimension of the rotary embedding for t, h, w.
    qkv_bias: bool
        Whether to use bias in the qkv linear layer.
    qk_norm: bool
        Whether to use qk norm.
    qk_norm_type: str
        The type of qk norm.
    guidance_embed: bool
        Whether to use guidance embedding for distillation.
    text_projection: str
        The type of the text projection, default is single_refiner.
    use_attention_mask: bool
        Whether to use attention mask for text encoder.
    dtype: torch.dtype
        The dtype of the model.
    device: torch.device
        The device of the model.
    attn_mode: str
        The mode of the attention, default is flash.
    split_attn: bool
        Whether to use split attention (make attention as batch size 1).
    """

    # @register_to_config
    def __init__(
        self,
        text_states_dim: int,
        text_states_dim_2: int,
        patch_size: list = [1, 2, 2],
        in_channels: int = 4,  # Should be VAE.config.latent_channels.
        out_channels: int = None,
        hidden_size: int = 3072,
        heads_num: int = 24,
        mlp_width_ratio: float = 4.0,
        mlp_act_type: str = "gelu_tanh",
        mm_double_blocks_depth: int = 20,
        mm_single_blocks_depth: int = 40,
        rope_dim_list: List[int] = [16, 56, 56],
        qkv_bias: bool = True,
        qk_norm: bool = True,
        qk_norm_type: str = "rms",
        guidance_embed: bool = False,  # For modulation.
        text_projection: str = "single_refiner",
        use_attention_mask: bool = True,
        dtype: Optional[torch.dtype] = None,
        device: Optional[torch.device] = None,
        attn_mode: str = "flash",
        split_attn: bool = False,
    ):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()

        self.patch_size = patch_size
        self.in_channels = in_channels
        self.out_channels = in_channels if out_channels is None else out_channels
        self.unpatchify_channels = self.out_channels
        self.guidance_embed = guidance_embed
        self.rope_dim_list = rope_dim_list

        # Text projection. Default to linear projection.
        # Alternative: TokenRefiner. See more details (LI-DiT): http://arxiv.org/abs/2406.11831
        self.use_attention_mask = use_attention_mask
        self.text_projection = text_projection

        self.text_states_dim = text_states_dim
        self.text_states_dim_2 = text_states_dim_2

        if hidden_size % heads_num != 0:
            raise ValueError(f"Hidden size {hidden_size} must be divisible by heads_num {heads_num}")
        pe_dim = hidden_size // heads_num
        if sum(rope_dim_list) != pe_dim:
            raise ValueError(f"Got {rope_dim_list} but expected positional dim {pe_dim}")
        self.hidden_size = hidden_size
        self.heads_num = heads_num

        self.attn_mode = attn_mode
        self.split_attn = split_attn
        print(f"Using {self.attn_mode} attention mode, split_attn: {self.split_attn}")

        # image projection
        self.img_in = PatchEmbed(self.patch_size, self.in_channels, self.hidden_size, **factory_kwargs)

        # text projection
        if self.text_projection == "linear":
            self.txt_in = TextProjection(
                self.text_states_dim,
                self.hidden_size,
                get_activation_layer("silu"),
                **factory_kwargs,
            )
        elif self.text_projection == "single_refiner":
            self.txt_in = SingleTokenRefiner(self.text_states_dim, hidden_size, heads_num, depth=2, **factory_kwargs)
        else:
            raise NotImplementedError(f"Unsupported text_projection: {self.text_projection}")

        # time modulation
        self.time_in = TimestepEmbedder(self.hidden_size, get_activation_layer("silu"), **factory_kwargs)

        # text modulation
        self.vector_in = MLPEmbedder(self.text_states_dim_2, self.hidden_size, **factory_kwargs)

        # guidance modulation
        self.guidance_in = (
            TimestepEmbedder(self.hidden_size, get_activation_layer("silu"), **factory_kwargs) if guidance_embed else None
        )

        # double blocks
        self.double_blocks = nn.ModuleList(
            [
                MMDoubleStreamBlock(
                    self.hidden_size,
                    self.heads_num,
                    mlp_width_ratio=mlp_width_ratio,
                    mlp_act_type=mlp_act_type,
                    qk_norm=qk_norm,
                    qk_norm_type=qk_norm_type,
                    qkv_bias=qkv_bias,
                    attn_mode=attn_mode,
                    split_attn=split_attn,
                    **factory_kwargs,
                )
                for _ in range(mm_double_blocks_depth)
            ]
        )

        # single blocks
        self.single_blocks = nn.ModuleList(
            [
                MMSingleStreamBlock(
                    self.hidden_size,
                    self.heads_num,
                    mlp_width_ratio=mlp_width_ratio,
                    mlp_act_type=mlp_act_type,
                    qk_norm=qk_norm,
                    qk_norm_type=qk_norm_type,
                    attn_mode=attn_mode,
                    split_attn=split_attn,
                    **factory_kwargs,
                )
                for _ in range(mm_single_blocks_depth)
            ]
        )

        self.final_layer = FinalLayer(
            self.hidden_size,
            self.patch_size,
            self.out_channels,
            get_activation_layer("silu"),
            **factory_kwargs,
        )

        # Runtime feature flags (set via the enable_* methods below).
        self.gradient_checkpointing = False
        self.blocks_to_swap = None
        self.offloader_double = None
        self.offloader_single = None
        self._enable_img_in_txt_in_offloading = False

    @property
    def device(self):
        """Device of the first parameter (assumes all parameters share a device)."""
        return next(self.parameters()).device

    @property
    def dtype(self):
        """Dtype of the first parameter (assumes all parameters share a dtype)."""
        return next(self.parameters()).dtype

    def enable_gradient_checkpointing(self):
        """Enable gradient checkpointing on the text refiner and every DiT block."""
        self.gradient_checkpointing = True

        self.txt_in.enable_gradient_checkpointing()

        for block in self.double_blocks + self.single_blocks:
            block.enable_gradient_checkpointing()

        print(f"HYVideoDiffusionTransformer: Gradient checkpointing enabled.")

    def disable_gradient_checkpointing(self):
        """Disable gradient checkpointing on the text refiner and every DiT block."""
        self.gradient_checkpointing = False

        self.txt_in.disable_gradient_checkpointing()

        for block in self.double_blocks + self.single_blocks:
            block.disable_gradient_checkpointing()

        print(f"HYVideoDiffusionTransformer: Gradient checkpointing disabled.")

    def enable_img_in_txt_in_offloading(self):
        """Offload the img_in/txt_in embedders to CPU between uses (see `forward`)."""
        self._enable_img_in_txt_in_offloading = True

    def enable_block_swap(self, num_blocks: int, device: torch.device, supports_backward: bool):
        """Enable CPU<->device block swapping for `num_blocks` blocks total.

        The budget is split between double and single blocks: each double block
        counts roughly as two single blocks (see the *2+1 conversion below).
        """
        self.blocks_to_swap = num_blocks
        self.num_double_blocks = len(self.double_blocks)
        self.num_single_blocks = len(self.single_blocks)
        double_blocks_to_swap = num_blocks // 2
        # remaining budget converted to single blocks (a double block ~ two singles)
        single_blocks_to_swap = (num_blocks - double_blocks_to_swap) * 2 + 1

        assert double_blocks_to_swap <= self.num_double_blocks - 1 and single_blocks_to_swap <= self.num_single_blocks - 1, (
            f"Cannot swap more than {self.num_double_blocks - 1} double blocks and {self.num_single_blocks - 1} single blocks. "
            f"Requested {double_blocks_to_swap} double blocks and {single_blocks_to_swap} single blocks."
        )

        self.offloader_double = ModelOffloader(
            "double", self.double_blocks, self.num_double_blocks, double_blocks_to_swap, supports_backward, device  # , debug=True
        )
        self.offloader_single = ModelOffloader(
            "single", self.single_blocks, self.num_single_blocks, single_blocks_to_swap, supports_backward, device  # , debug=True
        )
        print(
            f"HYVideoDiffusionTransformer: Block swap enabled. Swapping {num_blocks} blocks, double blocks: {double_blocks_to_swap}, single blocks: {single_blocks_to_swap}."
        )

    def switch_block_swap_for_inference(self):
        """Put the offloaders into forward-only mode (no backward bookkeeping)."""
        if self.blocks_to_swap:
            self.offloader_double.set_forward_only(True)
            self.offloader_single.set_forward_only(True)
            self.prepare_block_swap_before_forward()
            print(f"HYVideoDiffusionTransformer: Block swap set to forward only.")

    def switch_block_swap_for_training(self):
        """Put the offloaders back into forward+backward mode."""
        if self.blocks_to_swap:
            self.offloader_double.set_forward_only(False)
            self.offloader_single.set_forward_only(False)
            self.prepare_block_swap_before_forward()
            print(f"HYVideoDiffusionTransformer: Block swap set to forward and backward.")

    def move_to_device_except_swap_blocks(self, device: torch.device):
        """Move the model to `device`, leaving swapped blocks where they are.

        Temporarily detaches the block lists so `self.to(device)` does not touch
        them, then restores the references.
        """
        # assume model is on cpu. do not move blocks to device to reduce temporary memory usage
        if self.blocks_to_swap:
            save_double_blocks = self.double_blocks
            save_single_blocks = self.single_blocks
            self.double_blocks = None
            self.single_blocks = None

        self.to(device)

        if self.blocks_to_swap:
            self.double_blocks = save_double_blocks
            self.single_blocks = save_single_blocks

    def prepare_block_swap_before_forward(self):
        """Stage the correct subset of blocks on-device before a forward pass."""
        if self.blocks_to_swap is None or self.blocks_to_swap == 0:
            return
        self.offloader_double.prepare_block_devices_before_forward(self.double_blocks)
        self.offloader_single.prepare_block_devices_before_forward(self.single_blocks)

    def enable_deterministic(self):
        """Propagate deterministic-attention mode to every block."""
        for block in self.double_blocks:
            block.enable_deterministic()
        for block in self.single_blocks:
            block.enable_deterministic()

    def disable_deterministic(self):
        """Restore default attention behavior on every block."""
        for block in self.double_blocks:
            block.disable_deterministic()
        for block in self.single_blocks:
            block.disable_deterministic()

    def forward(
        self,
        x: torch.Tensor,
        t: torch.Tensor,  # Should be in range(0, 1000).
        text_states: torch.Tensor = None,
        text_mask: torch.Tensor = None,  # Now we don't use it.
        text_states_2: Optional[torch.Tensor] = None,  # Text embedding for modulation.
        freqs_cos: Optional[torch.Tensor] = None,
        freqs_sin: Optional[torch.Tensor] = None,
        guidance: torch.Tensor = None,  # Guidance for modulation, should be cfg_scale x 1000.
        return_dict: bool = True,
    ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
        """Denoising forward pass.

        Args:
            x: latent video, shape (N, C, T, H, W).
            t: diffusion timestep per sample.
            text_states: token-level text embeddings (primary encoder).
            text_mask: attention mask over text tokens; also used to derive
                per-sample sequence lengths for flash/sdpa attention.
            text_states_2: pooled text embedding used for modulation.
            freqs_cos / freqs_sin: precomputed RoPE tables for the image tokens.
            guidance: guidance strength, required when ``guidance_embed`` is set.
            return_dict: when True, return ``{"x": out}``; otherwise the tensor.
        """
        out = {}
        img = x
        txt = text_states
        _, _, ot, oh, ow = x.shape
        # token-grid size after patchification
        tt, th, tw = (
            ot // self.patch_size[0],
            oh // self.patch_size[1],
            ow // self.patch_size[2],
        )

        # Prepare modulation vectors.
        vec = self.time_in(t)

        # text modulation
        vec = vec + self.vector_in(text_states_2)

        # guidance modulation
        if self.guidance_embed:
            if guidance is None:
                raise ValueError("Didn't get guidance strength for guidance distilled model.")

            # our timestep_embedding is merged into guidance_in(TimestepEmbedder)
            vec = vec + self.guidance_in(guidance)

        # Embed image and text. Optionally keep the embedders on CPU except while in use.
        if self._enable_img_in_txt_in_offloading:
            self.img_in.to(x.device, non_blocking=True)
            self.txt_in.to(x.device, non_blocking=True)
            synchronize_device(x.device)

        img = self.img_in(img)
        if self.text_projection == "linear":
            txt = self.txt_in(txt)
        elif self.text_projection == "single_refiner":
            txt = self.txt_in(txt, t, text_mask if self.use_attention_mask else None)
        else:
            raise NotImplementedError(f"Unsupported text_projection: {self.text_projection}")

        if self._enable_img_in_txt_in_offloading:
            self.img_in.to(torch.device("cpu"), non_blocking=True)
            self.txt_in.to(torch.device("cpu"), non_blocking=True)
            synchronize_device(x.device)
            clean_memory_on_device(x.device)

        txt_seq_len = txt.shape[1]
        img_seq_len = img.shape[1]

        # Compute cu_squlens and max_seqlen for flash attention
        cu_seqlens_q = get_cu_seqlens(text_mask, img_seq_len)
        cu_seqlens_kv = cu_seqlens_q
        max_seqlen_q = img_seq_len + txt_seq_len
        max_seqlen_kv = max_seqlen_q

        attn_mask = total_len = None
        if self.split_attn or self.attn_mode == "torch":
            # calculate text length and total length
            text_len = text_mask.sum(dim=1)  # (bs, )
            total_len = img_seq_len + text_len  # (bs, )
        if self.attn_mode == "torch" and not self.split_attn:
            # initialize attention mask: bool tensor for sdpa, (b, 1, n, n)
            bs = img.shape[0]
            attn_mask = torch.zeros((bs, 1, max_seqlen_q, max_seqlen_q), dtype=torch.bool, device=text_mask.device)

            # set attention mask with total_len
            for i in range(bs):
                attn_mask[i, :, : total_len[i], : total_len[i]] = True
            total_len = None  # means we don't use split_attn

        freqs_cis = (freqs_cos, freqs_sin) if freqs_cos is not None else None
        # --------------------- Pass through DiT blocks ------------------------
        for block_idx, block in enumerate(self.double_blocks):
            double_block_args = [
                img,
                txt,
                vec,
                attn_mask,
                total_len,
                cu_seqlens_q,
                cu_seqlens_kv,
                max_seqlen_q,
                max_seqlen_kv,
                freqs_cis,
            ]

            if self.blocks_to_swap:
                self.offloader_double.wait_for_block(block_idx)

            img, txt = block(*double_block_args)

            if self.blocks_to_swap:
                self.offloader_double.submit_move_blocks_forward(self.double_blocks, block_idx)

        # Merge txt and img to pass through single stream blocks.
        x = torch.cat((img, txt), 1)
        if self.blocks_to_swap:
            # delete img, txt to reduce memory usage
            del img, txt
            clean_memory_on_device(x.device)

        if len(self.single_blocks) > 0:
            for block_idx, block in enumerate(self.single_blocks):
                single_block_args = [
                    x,
                    vec,
                    txt_seq_len,
                    attn_mask,
                    total_len,
                    cu_seqlens_q,
                    cu_seqlens_kv,
                    max_seqlen_q,
                    max_seqlen_kv,
                    freqs_cis,
                ]
                if self.blocks_to_swap:
                    self.offloader_single.wait_for_block(block_idx)

                x = block(*single_block_args)

                if self.blocks_to_swap:
                    self.offloader_single.submit_move_blocks_forward(self.single_blocks, block_idx)

        img = x[:, :img_seq_len, ...]
        x = None

        # ---------------------------- Final layer ------------------------------
        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)

        img = self.unpatchify(img, tt, th, tw)
        if return_dict:
            out["x"] = img
            return out
        return img

    def unpatchify(self, x, t, h, w):
        """
        x: (N, T, patch_size**2 * C)
        imgs: (N, H, W, C)
        """
        c = self.unpatchify_channels
        pt, ph, pw = self.patch_size
        assert t * h * w == x.shape[1]

        x = x.reshape(shape=(x.shape[0], t, h, w, c, pt, ph, pw))
        x = torch.einsum("nthwcopq->nctohpwq", x)
        imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw))

        return imgs

    def params_count(self):
        """Return parameter counts for the attention/MLP weights and the full model."""
        counts = {
            "double": sum(
                [
                    sum(p.numel() for p in block.img_attn_qkv.parameters())
                    + sum(p.numel() for p in block.img_attn_proj.parameters())
                    + sum(p.numel() for p in block.img_mlp.parameters())
                    + sum(p.numel() for p in block.txt_attn_qkv.parameters())
                    + sum(p.numel() for p in block.txt_attn_proj.parameters())
                    + sum(p.numel() for p in block.txt_mlp.parameters())
                    for block in self.double_blocks
                ]
            ),
            "single": sum(
                [
                    sum(p.numel() for p in block.linear1.parameters()) + sum(p.numel() for p in block.linear2.parameters())
                    for block in self.single_blocks
                ]
            ),
            "total": sum(p.numel() for p in self.parameters()),
        }
        counts["attn+mlp"] = counts["double"] + counts["single"]
        return counts
899
+
900
+
901
#################################################################################
#                             HunyuanVideo Configs                              #
#################################################################################

# Named architecture presets consumed by `load_dit_model`. The "-cfgdistill"
# variant is identical to the base layout except that it enables the guidance
# (CFG-distillation) embedding.
HUNYUAN_VIDEO_CONFIG = {
    "HYVideo-T/2": {
        "mm_double_blocks_depth": 20,
        "mm_single_blocks_depth": 40,
        "rope_dim_list": [16, 56, 56],
        "hidden_size": 3072,
        "heads_num": 24,
        "mlp_width_ratio": 4,
    },
    "HYVideo-T/2-cfgdistill": {
        "mm_double_blocks_depth": 20,
        "mm_single_blocks_depth": 40,
        "rope_dim_list": [16, 56, 56],
        "hidden_size": 3072,
        "heads_num": 24,
        "mlp_width_ratio": 4,
        "guidance_embed": True,
    },
}
924
+
925
+
926
def load_dit_model(text_states_dim, text_states_dim_2, in_channels, out_channels, factor_kwargs):
    """Build the HunyuanVideo DiT backbone.

    NOTE: Only the "HYVideo-T/2-cfgdistill" configuration is supported.

    Args:
        text_states_dim (int): dimension of the primary text-encoder states
        text_states_dim_2 (int): dimension of the secondary text-encoder states
        in_channels (int): number of input (latent) channels
        out_channels (int): number of output (latent) channels
        factor_kwargs (dict): extra constructor kwargs (device, dtype, attn_mode, split_attn)

    Returns:
        nn.Module: the instantiated HYVideoDiffusionTransformer
    """
    arch_config = HUNYUAN_VIDEO_CONFIG["HYVideo-T/2-cfgdistill"]
    return HYVideoDiffusionTransformer(
        text_states_dim=text_states_dim,
        text_states_dim_2=text_states_dim_2,
        in_channels=in_channels,
        out_channels=out_channels,
        **arch_config,
        **factor_kwargs,
    )
953
+
954
+
955
+ def load_state_dict(model, model_path):
956
+ state_dict = torch.load(model_path, map_location=lambda storage, loc: storage, weights_only=True)
957
+
958
+ load_key = "module"
959
+ if load_key in state_dict:
960
+ state_dict = state_dict[load_key]
961
+ else:
962
+ raise KeyError(
963
+ f"Missing key: `{load_key}` in the checkpoint: {model_path}. The keys in the checkpoint "
964
+ f"are: {list(state_dict.keys())}."
965
+ )
966
+ model.load_state_dict(state_dict, strict=True, assign=True)
967
+ return model
968
+
969
+
970
def load_transformer(dit_path, attn_mode, split_attn, device, dtype, in_channels=16) -> HYVideoDiffusionTransformer:
    """Instantiate the DiT with empty (meta) weights and load them from *dit_path*.

    Safetensors checkpoints are streamed tensor-by-tensor to `device`/`dtype`
    (they may already be stored in fp8); any other extension is treated as a
    torch checkpoint with a "module" key (see `load_state_dict`).
    """
    # =========================== Build main model ===========================
    factor_kwargs = {"device": device, "dtype": dtype, "attn_mode": attn_mode, "split_attn": split_attn}
    latent_channels = 16
    out_channels = latent_channels

    # build on the meta device so no memory is allocated before weights arrive
    with accelerate.init_empty_weights():
        transformer = load_dit_model(
            text_states_dim=4096,
            text_states_dim_2=768,
            in_channels=in_channels,
            out_channels=out_channels,
            factor_kwargs=factor_kwargs,
        )

    if os.path.splitext(dit_path)[-1] == ".safetensors":
        # loading safetensors: may be already fp8
        with MemoryEfficientSafeOpen(dit_path) as f:
            state_dict = {}
            for k in f.keys():
                tensor = f.get_tensor(k)
                tensor = tensor.to(device=device, dtype=dtype)
                # TODO support comfy model
                # if k.startswith("model.model."):
                #     k = convert_comfy_model_key(k)
                state_dict[k] = tensor
            # assign=True replaces the meta tensors with the loaded ones
            transformer.load_state_dict(state_dict, strict=True, assign=True)
    else:
        transformer = load_state_dict(transformer, dit_path)

    return transformer
1001
+
1002
+
1003
def get_rotary_pos_embed_by_shape(model, latents_size):
    """Compute real-valued RoPE cos/sin tables for a latent grid.

    Args:
        model: object exposing ``patch_size``, ``hidden_size``, ``heads_num``
            and ``rope_dim_list``.
        latents_size: latent extents for the last (t, h, w) dimensions; shorter
            sequences are left-padded with 1 (time axis).

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: (freqs_cos, freqs_sin).

    Raises:
        TypeError: if ``model.patch_size`` is neither an int nor a list/tuple.
    """
    target_ndim = 3
    ndim = 5 - 2  # latents are 5-D (N, C, T, H, W); RoPE covers the last 3 axes

    if isinstance(model.patch_size, int):
        assert all(s % model.patch_size == 0 for s in latents_size), (
            f"Latent size(last {ndim} dimensions) should be divisible by patch size({model.patch_size}), "
            f"but got {latents_size}."
        )
        rope_sizes = [s // model.patch_size for s in latents_size]
    elif isinstance(model.patch_size, (list, tuple)):  # accept tuples too; was list-only
        assert all(s % model.patch_size[idx] == 0 for idx, s in enumerate(latents_size)), (
            f"Latent size(last {ndim} dimensions) should be divisible by patch size({model.patch_size}), "
            f"but got {latents_size}."
        )
        rope_sizes = [s // model.patch_size[idx] for idx, s in enumerate(latents_size)]
    else:
        # Previously an unsupported patch_size type fell through both branches and
        # crashed later with an unbound-variable NameError; fail fast instead.
        raise TypeError(f"Unsupported patch_size type: {type(model.patch_size)}")

    if len(rope_sizes) != target_ndim:
        rope_sizes = [1] * (target_ndim - len(rope_sizes)) + rope_sizes  # time axis
    head_dim = model.hidden_size // model.heads_num
    rope_dim_list = model.rope_dim_list
    if rope_dim_list is None:
        # split the head dim evenly across the three axes when not configured
        rope_dim_list = [head_dim // target_ndim for _ in range(target_ndim)]
    assert sum(rope_dim_list) == head_dim, "sum(rope_dim_list) should equal to head_dim of attention layer"

    rope_theta = 256
    freqs_cos, freqs_sin = get_nd_rotary_pos_embed(
        rope_dim_list, rope_sizes, theta=rope_theta, use_real=True, theta_rescale_factor=1
    )
    return freqs_cos, freqs_sin
1033
+
1034
+
1035
def get_rotary_pos_embed(vae_name, model, video_length, height, width):
    """Compute RoPE tables for a video, deriving the latent grid from the VAE name.

    VAE names containing "884" (resp. "888") denote 4x (resp. 8x) temporal
    compression with 8x spatial compression; any other name is treated as
    spatially compressed only.
    """
    spatial = [height // 8, width // 8]
    # 884
    if "884" in vae_name:
        frames = (video_length - 1) // 4 + 1
    elif "888" in vae_name:
        frames = (video_length - 1) // 8 + 1
    else:
        frames = video_length

    return get_rotary_pos_embed_by_shape(model, [frames] + spatial)
hunyuan_model/modulate_layers.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+
7
class ModulateDiT(nn.Module):
    """Zero-initialized modulation head for DiT blocks.

    Projects a conditioning vector through a nonlinearity into
    ``factor * hidden_size`` modulation parameters. The linear layer's weight
    and bias start at zero, so the modulation is inert at initialization.
    """

    def __init__(
        self,
        hidden_size: int,
        factor: int,
        act_layer: Callable,
        dtype=None,
        device=None,
    ):
        super().__init__()
        factory_kwargs = {"dtype": dtype, "device": device}
        self.act = act_layer()
        self.linear = nn.Linear(hidden_size, factor * hidden_size, bias=True, **factory_kwargs)
        # Zero-initialize the modulation so it starts as a no-op.
        nn.init.zeros_(self.linear.weight)
        nn.init.zeros_(self.linear.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the modulation parameters for conditioning vector *x*."""
        activated = self.act(x)
        return self.linear(activated)
29
+
30
+
31
def modulate(x, shift=None, scale=None):
    """Apply an affine (shift/scale) modulation to *x*.

    ``shift`` and ``scale`` are per-sample vectors broadcast over the sequence
    dimension via ``unsqueeze(1)``; either (or both) may be omitted.

    Args:
        x (torch.Tensor): input tensor.
        shift (torch.Tensor, optional): additive term. Defaults to None.
        scale (torch.Tensor, optional): multiplicative term, applied as
            ``x * (1 + scale)``. Defaults to None.

    Returns:
        torch.Tensor: the modulated tensor (``x`` itself when both are None).
    """
    result = x
    if scale is not None:
        result = result * (1 + scale.unsqueeze(1))
    if shift is not None:
        result = result + shift.unsqueeze(1)
    return result
50
+
51
+
52
def apply_gate(x, gate=None, tanh=False):
    """Scale *x* by a per-sample gate vector.

    Args:
        x (torch.Tensor): input tensor.
        gate (torch.Tensor, optional): gate tensor, broadcast over the sequence
            dimension. When None, *x* is returned unchanged.
        tanh (bool, optional): squash the gate through ``tanh`` first.

    Returns:
        torch.Tensor: the gated tensor.
    """
    if gate is None:
        return x
    g = gate.unsqueeze(1)
    return x * (g.tanh() if tanh else g)
69
+
70
+
71
def ckpt_wrapper(module):
    """Wrap *module* in a plain positional-args forwarder.

    Useful as the callable handed to activation checkpointing, which expects a
    function taking only positional inputs.
    """

    def ckpt_forward(*inputs):
        return module(*inputs)

    return ckpt_forward
hunyuan_model/norm_layers.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
class RMSNorm(nn.Module):
    """Root-mean-square layer normalization.

    Normalizes the last dimension of the input by its RMS (computed in
    float32 and cast back to the input dtype) and, when ``elementwise_affine``
    is set, applies a learnable per-channel scale.
    """

    def __init__(
        self,
        dim: int,
        elementwise_affine=True,
        eps: float = 1e-6,
        device=None,
        dtype=None,
    ):
        """
        Args:
            dim (int): size of the normalized (last) dimension.
            elementwise_affine (bool): whether to learn a per-channel ``weight``.
            eps (float): stability term added to the mean square before rsqrt.
        """
        super().__init__()
        self.eps = eps
        # `weight` only exists in the affine configuration; `forward` probes
        # for it with hasattr.
        if elementwise_affine:
            self.weight = nn.Parameter(torch.ones(dim, device=device, dtype=dtype))

    def _norm(self, x):
        """Divide *x* by its RMS along the last dimension."""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x):
        """Apply RMS normalization (and the optional learned scale) to *x*."""
        normed = self._norm(x.float()).type_as(x)
        if not hasattr(self, "weight"):
            return normed
        # cast the weight instead of the activations so fp8 weights work
        return normed * self.weight.to(normed.dtype)
62
+
63
+
64
def get_norm_layer(norm_layer):
    """Resolve a normalization layer class from its short name.

    Args:
        norm_layer (str): either "layer" (nn.LayerNorm) or "rms" (RMSNorm).

    Returns:
        type: the normalization layer class.

    Raises:
        NotImplementedError: for any unrecognized name.
    """
    if norm_layer == "layer":
        return nn.LayerNorm
    if norm_layer == "rms":
        return RMSNorm
    raise NotImplementedError(f"Norm layer {norm_layer} is not implemented")
hunyuan_model/pipeline_hunyuan_video.py ADDED
@@ -0,0 +1,1100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+ #
16
+ # Modified from diffusers==0.29.2
17
+ #
18
+ # ==============================================================================
19
+ import inspect
20
+ from typing import Any, Callable, Dict, List, Optional, Union, Tuple
21
+ import torch
22
+ import torch.distributed as dist
23
+ import numpy as np
24
+ from dataclasses import dataclass
25
+ from packaging import version
26
+
27
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
28
+ from diffusers.configuration_utils import FrozenDict
29
+ from diffusers.image_processor import VaeImageProcessor
30
+ from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
31
+ from diffusers.models import AutoencoderKL
32
+ from diffusers.models.lora import adjust_lora_scale_text_encoder
33
+ from diffusers.schedulers import KarrasDiffusionSchedulers
34
+ from diffusers.utils import (
35
+ USE_PEFT_BACKEND,
36
+ deprecate,
37
+ logging,
38
+ replace_example_docstring,
39
+ scale_lora_layers,
40
+ unscale_lora_layers,
41
+ )
42
+ from diffusers.utils.torch_utils import randn_tensor
43
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
44
+ from diffusers.utils import BaseOutput
45
+
46
+ from ...constants import PRECISION_TO_TYPE
47
+ from ...vae.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
48
+ from ...text_encoder import TextEncoder
49
+ from ...modules import HYVideoDiffusionTransformer
50
+
51
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
52
+
53
+ EXAMPLE_DOC_STRING = """"""
54
+
55
+
56
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    """
    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
    """
    # Per-sample std over all non-batch dims for both predictions.
    text_dims = list(range(1, noise_pred_text.ndim))
    cfg_dims = list(range(1, noise_cfg.ndim))
    std_text = noise_pred_text.std(dim=text_dims, keepdim=True)
    std_cfg = noise_cfg.std(dim=cfg_dims, keepdim=True)
    # Rescale the guided prediction to the text prediction's std (fixes overexposure).
    rescaled = noise_cfg * (std_text / std_cfg)
    # Blend rescaled and original guidance to avoid "plain looking" images.
    return guidance_rescale * rescaled + (1 - guidance_rescale) * noise_cfg
72
+
73
+
74
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    """
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError(
            "Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values"
        )

    def _accepts(param_name: str) -> bool:
        # Not every scheduler's `set_timesteps` supports custom schedules; inspect its signature.
        return param_name in inspect.signature(scheduler.set_timesteps).parameters

    if timesteps is not None:
        if not _accepts("timesteps"):
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    elif sigmas is not None:
        if not _accepts("sigmas"):
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    else:
        # Default path: the caller-provided step count is returned unchanged.
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps
    return timesteps, num_inference_steps
137
+
138
+
139
@dataclass
class HunyuanVideoPipelineOutput(BaseOutput):
    """Output of `HunyuanVideoPipeline.__call__`.

    Attributes:
        videos: Generated video frames, as a `torch.Tensor` or `np.ndarray`
            (concrete layout depends on the requested `output_type`).
    """

    videos: Union[torch.Tensor, np.ndarray]
142
+
143
+
144
+ class HunyuanVideoPipeline(DiffusionPipeline):
145
+ r"""
146
+ Pipeline for text-to-video generation using HunyuanVideo.
147
+
148
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
149
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
150
+
151
+ Args:
152
+ vae ([`AutoencoderKL`]):
153
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
154
+ text_encoder ([`TextEncoder`]):
155
+ Frozen text-encoder.
156
+ text_encoder_2 ([`TextEncoder`]):
157
+ Frozen text-encoder_2.
158
+ transformer ([`HYVideoDiffusionTransformer`]):
159
+ A `HYVideoDiffusionTransformer` to denoise the encoded video latents.
160
+ scheduler ([`SchedulerMixin`]):
161
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents.
162
+ """
163
+
164
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
165
+ _optional_components = ["text_encoder_2"]
166
+ _exclude_from_cpu_offload = ["transformer"]
167
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
168
+
169
    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: TextEncoder,
        transformer: HYVideoDiffusionTransformer,
        scheduler: KarrasDiffusionSchedulers,
        text_encoder_2: Optional[TextEncoder] = None,
        progress_bar_config: Dict[str, Any] = None,
        args=None,
    ):
        """Register pipeline modules and patch deprecated scheduler config fields.

        Args:
            vae: VAE used to encode/decode between pixel and latent space.
            text_encoder: Primary frozen text encoder.
            transformer: Diffusion transformer that denoises the video latents.
            scheduler: Noise scheduler used in the denoising loop.
            text_encoder_2: Optional secondary text encoder (may be None).
            progress_bar_config: Extra kwargs merged into the pipeline's
                progress-bar configuration.
            args: Opaque runtime options stored on the pipeline as `self.args`.
        """
        super().__init__()

        # ==========================================================================================
        # Merge caller-supplied progress-bar options into any config DiffusionPipeline set up.
        if progress_bar_config is None:
            progress_bar_config = {}
        if not hasattr(self, "_progress_bar_config"):
            self._progress_bar_config = {}
        self._progress_bar_config.update(progress_bar_config)

        self.args = args
        # ==========================================================================================

        # Legacy scheduler configs may carry steps_offset != 1; emit a deprecation
        # warning and rewrite the frozen config in place to the expected value.
        if (
            hasattr(scheduler.config, "steps_offset")
            and scheduler.config.steps_offset != 1
        ):
            deprecation_message = (
                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
                " file"
            )
            deprecate(
                "steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False
            )
            new_config = dict(scheduler.config)
            new_config["steps_offset"] = 1
            scheduler._internal_dict = FrozenDict(new_config)

        # Likewise force clip_sample=False on schedulers whose config still enables it.
        if (
            hasattr(scheduler.config, "clip_sample")
            and scheduler.config.clip_sample is True
        ):
            deprecation_message = (
                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
            )
            deprecate(
                "clip_sample not set", "1.0.0", deprecation_message, standard_warn=False
            )
            new_config = dict(scheduler.config)
            new_config["clip_sample"] = False
            scheduler._internal_dict = FrozenDict(new_config)

        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            transformer=transformer,
            scheduler=scheduler,
            text_encoder_2=text_encoder_2,
        )
        # Spatial downsampling factor implied by the VAE's encoder depth.
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
237
+
238
    def encode_prompt(
        self,
        prompt,
        device,
        num_videos_per_prompt,
        do_classifier_free_guidance,
        negative_prompt=None,
        prompt_embeds: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        negative_attention_mask: Optional[torch.Tensor] = None,
        lora_scale: Optional[float] = None,
        clip_skip: Optional[int] = None,
        text_encoder: Optional[TextEncoder] = None,
        data_type: Optional[str] = "image",
    ):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            device: (`torch.device`):
                torch device
            num_videos_per_prompt (`int`):
                number of videos that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the video generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            attention_mask (`torch.Tensor`, *optional*):
                Pre-generated attention mask matching `prompt_embeds`.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            negative_attention_mask (`torch.Tensor`, *optional*):
                Pre-generated attention mask matching `negative_prompt_embeds`.
            lora_scale (`float`, *optional*):
                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
            text_encoder (TextEncoder, *optional*):
                Encoder to use; falls back to `self.text_encoder` when None.
            data_type (`str`, *optional*):
                Forwarded to the text encoder's tokenize/encode calls.

        Returns:
            Tuple of `(prompt_embeds, negative_prompt_embeds, attention_mask,
            negative_attention_mask)`; the negative entries are None when
            classifier-free guidance is disabled and none were supplied.
        """
        if text_encoder is None:
            text_encoder = self.text_encoder

        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
            if not USE_PEFT_BACKEND:
                adjust_lora_scale_text_encoder(text_encoder.model, lora_scale)
            else:
                scale_lora_layers(text_encoder.model, lora_scale)

        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:
            # textual inversion: process multi-vector tokens if necessary
            if isinstance(self, TextualInversionLoaderMixin):
                prompt = self.maybe_convert_prompt(prompt, text_encoder.tokenizer)

            text_inputs = text_encoder.text2tokens(prompt, data_type=data_type)

            if clip_skip is None:
                prompt_outputs = text_encoder.encode(
                    text_inputs, data_type=data_type, device=device
                )
                prompt_embeds = prompt_outputs.hidden_state
            else:
                prompt_outputs = text_encoder.encode(
                    text_inputs,
                    output_hidden_states=True,
                    data_type=data_type,
                    device=device,
                )
                # Access the `hidden_states` first, that contains a tuple of
                # all the hidden states from the encoder layers. Then index into
                # the tuple to access the hidden states from the desired layer.
                prompt_embeds = prompt_outputs.hidden_states_list[-(clip_skip + 1)]
                # We also need to apply the final LayerNorm here to not mess with the
                # representations. The `last_hidden_states` that we typically use for
                # obtaining the final prompt representations passes through the LayerNorm
                # layer.
                prompt_embeds = text_encoder.model.text_model.final_layer_norm(
                    prompt_embeds
                )

            # Tile the attention mask for each generation per prompt.
            attention_mask = prompt_outputs.attention_mask
            if attention_mask is not None:
                attention_mask = attention_mask.to(device)
                bs_embed, seq_len = attention_mask.shape
                attention_mask = attention_mask.repeat(1, num_videos_per_prompt)
                attention_mask = attention_mask.view(
                    bs_embed * num_videos_per_prompt, seq_len
                )

        if text_encoder is not None:
            prompt_embeds_dtype = text_encoder.dtype
        elif self.transformer is not None:
            prompt_embeds_dtype = self.transformer.dtype
        else:
            prompt_embeds_dtype = prompt_embeds.dtype

        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

        if prompt_embeds.ndim == 2:
            bs_embed, _ = prompt_embeds.shape
            # duplicate text embeddings for each generation per prompt, using mps friendly method
            prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt)
            prompt_embeds = prompt_embeds.view(bs_embed * num_videos_per_prompt, -1)
        else:
            bs_embed, seq_len, _ = prompt_embeds.shape
            # duplicate text embeddings for each generation per prompt, using mps friendly method
            prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
            prompt_embeds = prompt_embeds.view(
                bs_embed * num_videos_per_prompt, seq_len, -1
            )

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance and negative_prompt_embeds is None:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
            elif prompt is not None and type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt]
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )
            else:
                uncond_tokens = negative_prompt

            # textual inversion: process multi-vector tokens if necessary
            if isinstance(self, TextualInversionLoaderMixin):
                uncond_tokens = self.maybe_convert_prompt(
                    uncond_tokens, text_encoder.tokenizer
                )

            # max_length = prompt_embeds.shape[1]
            uncond_input = text_encoder.text2tokens(uncond_tokens, data_type=data_type)

            negative_prompt_outputs = text_encoder.encode(
                uncond_input, data_type=data_type, device=device
            )
            negative_prompt_embeds = negative_prompt_outputs.hidden_state

            negative_attention_mask = negative_prompt_outputs.attention_mask
            if negative_attention_mask is not None:
                negative_attention_mask = negative_attention_mask.to(device)
                _, seq_len = negative_attention_mask.shape
                negative_attention_mask = negative_attention_mask.repeat(
                    1, num_videos_per_prompt
                )
                negative_attention_mask = negative_attention_mask.view(
                    batch_size * num_videos_per_prompt, seq_len
                )

        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
            seq_len = negative_prompt_embeds.shape[1]

            negative_prompt_embeds = negative_prompt_embeds.to(
                dtype=prompt_embeds_dtype, device=device
            )

            if negative_prompt_embeds.ndim == 2:
                negative_prompt_embeds = negative_prompt_embeds.repeat(
                    1, num_videos_per_prompt
                )
                negative_prompt_embeds = negative_prompt_embeds.view(
                    batch_size * num_videos_per_prompt, -1
                )
            else:
                negative_prompt_embeds = negative_prompt_embeds.repeat(
                    1, num_videos_per_prompt, 1
                )
                negative_prompt_embeds = negative_prompt_embeds.view(
                    batch_size * num_videos_per_prompt, seq_len, -1
                )

        if text_encoder is not None:
            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
                # Retrieve the original scale by scaling back the LoRA layers
                unscale_lora_layers(text_encoder.model, lora_scale)

        return (
            prompt_embeds,
            negative_prompt_embeds,
            attention_mask,
            negative_attention_mask,
        )
450
+
451
+ def decode_latents(self, latents, enable_tiling=True):
452
+ deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
453
+ deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
454
+
455
+ latents = 1 / self.vae.config.scaling_factor * latents
456
+ if enable_tiling:
457
+ self.vae.enable_tiling()
458
+ image = self.vae.decode(latents, return_dict=False)[0]
459
+ else:
460
+ image = self.vae.decode(latents, return_dict=False)[0]
461
+ image = (image / 2 + 0.5).clamp(0, 1)
462
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
463
+ if image.ndim == 4:
464
+ image = image.cpu().permute(0, 2, 3, 1).float()
465
+ else:
466
+ image = image.cpu().float()
467
+ return image
468
+
469
+ def prepare_extra_func_kwargs(self, func, kwargs):
470
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
471
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
472
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
473
+ # and should be between [0, 1]
474
+ extra_step_kwargs = {}
475
+
476
+ for k, v in kwargs.items():
477
+ accepts = k in set(inspect.signature(func).parameters.keys())
478
+ if accepts:
479
+ extra_step_kwargs[k] = v
480
+ return extra_step_kwargs
481
+
482
    def check_inputs(
        self,
        prompt,
        height,
        width,
        video_length,
        callback_steps,
        negative_prompt=None,
        prompt_embeds=None,
        negative_prompt_embeds=None,
        callback_on_step_end_tensor_inputs=None,
        vae_ver="88-4c-sd",
    ):
        """Validate `__call__` arguments; raises ValueError/TypeError on misuse.

        Checks: spatial dims divisible by 8; `video_length` compatible with
        the VAE version string; callback arguments well-formed; exactly one
        of `prompt`/`prompt_embeds` given; negative prompt/embeds consistent.
        """
        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(
                f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
            )

        # The modulus presumably matches the VAE's temporal stride (4 for
        # "884" variants, 8 for "888") — TODO confirm against the VAE impl.
        if video_length is not None:
            if "884" in vae_ver:
                if video_length != 1 and (video_length - 1) % 4 != 0:
                    raise ValueError(
                        f"`video_length` has to be 1 or a multiple of 4 but is {video_length}."
                    )
            elif "888" in vae_ver:
                if video_length != 1 and (video_length - 1) % 8 != 0:
                    raise ValueError(
                        f"`video_length` has to be 1 or a multiple of 8 but is {video_length}."
                    )

        if callback_steps is not None and (
            not isinstance(callback_steps, int) or callback_steps <= 0
        ):
            raise ValueError(
                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                f" {type(callback_steps)}."
            )
        if callback_on_step_end_tensor_inputs is not None and not all(
            k in self._callback_tensor_inputs
            for k in callback_on_step_end_tensor_inputs
        ):
            raise ValueError(
                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
            )

        # Exactly one of `prompt` / `prompt_embeds` must be supplied.
        if prompt is not None and prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
                " only forward one of the two."
            )
        elif prompt is None and prompt_embeds is None:
            raise ValueError(
                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
            )
        elif prompt is not None and (
            not isinstance(prompt, str) and not isinstance(prompt, list)
        ):
            raise ValueError(
                f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
            )

        if negative_prompt is not None and negative_prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )

        if prompt_embeds is not None and negative_prompt_embeds is not None:
            if prompt_embeds.shape != negative_prompt_embeds.shape:
                raise ValueError(
                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                    f" {negative_prompt_embeds.shape}."
                )
556
+
557
+
558
+ def prepare_latents(
559
+ self,
560
+ batch_size,
561
+ num_channels_latents,
562
+ height,
563
+ width,
564
+ video_length,
565
+ dtype,
566
+ device,
567
+ generator,
568
+ latents=None,
569
+ ):
570
+ shape = (
571
+ batch_size,
572
+ num_channels_latents,
573
+ video_length,
574
+ int(height) // self.vae_scale_factor,
575
+ int(width) // self.vae_scale_factor,
576
+ )
577
+ if isinstance(generator, list) and len(generator) != batch_size:
578
+ raise ValueError(
579
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
580
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
581
+ )
582
+
583
+ if latents is None:
584
+ latents = randn_tensor(
585
+ shape, generator=generator, device=device, dtype=dtype
586
+ )
587
+ else:
588
+ latents = latents.to(device)
589
+
590
+ # Check existence to make it compatible with FlowMatchEulerDiscreteScheduler
591
+ if hasattr(self.scheduler, "init_noise_sigma"):
592
+ # scale the initial noise by the standard deviation required by the scheduler
593
+ latents = latents * self.scheduler.init_noise_sigma
594
+ return latents
595
+
596
+ # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
597
+ def get_guidance_scale_embedding(
598
+ self,
599
+ w: torch.Tensor,
600
+ embedding_dim: int = 512,
601
+ dtype: torch.dtype = torch.float32,
602
+ ) -> torch.Tensor:
603
+ """
604
+ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
605
+
606
+ Args:
607
+ w (`torch.Tensor`):
608
+ Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
609
+ embedding_dim (`int`, *optional*, defaults to 512):
610
+ Dimension of the embeddings to generate.
611
+ dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
612
+ Data type of the generated embeddings.
613
+
614
+ Returns:
615
+ `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
616
+ """
617
+ assert len(w.shape) == 1
618
+ w = w * 1000.0
619
+
620
+ half_dim = embedding_dim // 2
621
+ emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
622
+ emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
623
+ emb = w.to(dtype)[:, None] * emb[None, :]
624
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
625
+ if embedding_dim % 2 == 1: # zero pad
626
+ emb = torch.nn.functional.pad(emb, (0, 1))
627
+ assert emb.shape == (w.shape[0], embedding_dim)
628
+ return emb
629
+
630
    @property
    def guidance_scale(self):
        # Classifier-free guidance weight for the current call (backed by `_guidance_scale`).
        return self._guidance_scale

    @property
    def guidance_rescale(self):
        # Factor used by `rescale_noise_cfg`; 0.0 means no rescaling.
        return self._guidance_rescale

    @property
    def clip_skip(self):
        # Number of CLIP layers skipped when computing prompt embeddings.
        return self._clip_skip

    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    @property
    def do_classifier_free_guidance(self):
        # return self._guidance_scale > 1 and self.transformer.config.time_cond_proj_dim is None
        return self._guidance_scale > 1

    @property
    def cross_attention_kwargs(self):
        # Extra kwargs forwarded to attention processors (backed by `_cross_attention_kwargs`).
        return self._cross_attention_kwargs

    @property
    def num_timesteps(self):
        # Number of scheduler timesteps recorded for the current call.
        return self._num_timesteps

    @property
    def interrupt(self):
        # Cooperative interruption flag (backed by `_interrupt`).
        return self._interrupt
661
+
662
+ @torch.no_grad()
663
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
664
+ def __call__(
665
+ self,
666
+ prompt: Union[str, List[str]],
667
+ height: int,
668
+ width: int,
669
+ video_length: int,
670
+ data_type: str = "video",
671
+ num_inference_steps: int = 50,
672
+ timesteps: List[int] = None,
673
+ sigmas: List[float] = None,
674
+ guidance_scale: float = 7.5,
675
+ negative_prompt: Optional[Union[str, List[str]]] = None,
676
+ num_videos_per_prompt: Optional[int] = 1,
677
+ eta: float = 0.0,
678
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
679
+ latents: Optional[torch.Tensor] = None,
680
+ prompt_embeds: Optional[torch.Tensor] = None,
681
+ attention_mask: Optional[torch.Tensor] = None,
682
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
683
+ negative_attention_mask: Optional[torch.Tensor] = None,
684
+ output_type: Optional[str] = "pil",
685
+ return_dict: bool = True,
686
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
687
+ guidance_rescale: float = 0.0,
688
+ clip_skip: Optional[int] = None,
689
+ callback_on_step_end: Optional[
690
+ Union[
691
+ Callable[[int, int, Dict], None],
692
+ PipelineCallback,
693
+ MultiPipelineCallbacks,
694
+ ]
695
+ ] = None,
696
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
697
+ freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None,
698
+ vae_ver: str = "88-4c-sd",
699
+ enable_tiling: bool = False,
700
+ n_tokens: Optional[int] = None,
701
+ embedded_guidance_scale: Optional[float] = None,
702
+ **kwargs,
703
+ ):
704
+ r"""
705
+ The call function to the pipeline for generation.
706
+
707
+ Args:
708
+ prompt (`str` or `List[str]`):
709
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
710
+ height (`int`):
711
+ The height in pixels of the generated image.
712
+ width (`int`):
713
+ The width in pixels of the generated image.
714
+ video_length (`int`):
715
+ The number of frames in the generated video.
716
+ num_inference_steps (`int`, *optional*, defaults to 50):
717
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
718
+ expense of slower inference.
719
+ timesteps (`List[int]`, *optional*):
720
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
721
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
722
+ passed will be used. Must be in descending order.
723
+ sigmas (`List[float]`, *optional*):
724
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
725
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
726
+ will be used.
727
+ guidance_scale (`float`, *optional*, defaults to 7.5):
728
+ A higher guidance scale value encourages the model to generate images closely linked to the text
729
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
730
+ negative_prompt (`str` or `List[str]`, *optional*):
731
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
732
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
733
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
734
+ The number of images to generate per prompt.
735
+ eta (`float`, *optional*, defaults to 0.0):
736
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
737
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
738
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
739
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
740
+ generation deterministic.
741
+ latents (`torch.Tensor`, *optional*):
742
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
743
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
744
+ tensor is generated by sampling using the supplied random `generator`.
745
+ prompt_embeds (`torch.Tensor`, *optional*):
746
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
747
+ provided, text embeddings are generated from the `prompt` input argument.
748
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
749
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
750
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
751
+
752
+ output_type (`str`, *optional*, defaults to `"pil"`):
753
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
754
+ return_dict (`bool`, *optional*, defaults to `True`):
755
+ Whether or not to return a [`HunyuanVideoPipelineOutput`] instead of a
756
+ plain tuple.
757
+ cross_attention_kwargs (`dict`, *optional*):
758
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
759
+ [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
760
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
761
+ Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
762
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
763
+ using zero terminal SNR.
764
+ clip_skip (`int`, *optional*):
765
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
766
+ the output of the pre-final layer will be used for computing the prompt embeddings.
767
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
768
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
769
+ each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
770
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
771
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
772
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
773
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
774
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
775
+ `._callback_tensor_inputs` attribute of your pipeline class.
776
+
777
+ Examples:
778
+
779
+ Returns:
780
+ [`~HunyuanVideoPipelineOutput`] or `tuple`:
781
+ If `return_dict` is `True`, [`HunyuanVideoPipelineOutput`] is returned,
782
+ otherwise a `tuple` is returned where the first element is a list with the generated images and the
783
+ second element is a list of `bool`s indicating whether the corresponding generated image contains
784
+ "not-safe-for-work" (nsfw) content.
785
+ """
786
+ callback = kwargs.pop("callback", None)
787
+ callback_steps = kwargs.pop("callback_steps", None)
788
+
789
+ if callback is not None:
790
+ deprecate(
791
+ "callback",
792
+ "1.0.0",
793
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
794
+ )
795
+ if callback_steps is not None:
796
+ deprecate(
797
+ "callback_steps",
798
+ "1.0.0",
799
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
800
+ )
801
+
802
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
803
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
804
+
805
+ # 0. Default height and width to unet
806
+ # height = height or self.transformer.config.sample_size * self.vae_scale_factor
807
+ # width = width or self.transformer.config.sample_size * self.vae_scale_factor
808
+ # to deal with lora scaling and other possible forward hooks
809
+
810
+ # 1. Check inputs. Raise error if not correct
811
+ self.check_inputs(
812
+ prompt,
813
+ height,
814
+ width,
815
+ video_length,
816
+ callback_steps,
817
+ negative_prompt,
818
+ prompt_embeds,
819
+ negative_prompt_embeds,
820
+ callback_on_step_end_tensor_inputs,
821
+ vae_ver=vae_ver,
822
+ )
823
+
824
+ self._guidance_scale = guidance_scale
825
+ self._guidance_rescale = guidance_rescale
826
+ self._clip_skip = clip_skip
827
+ self._cross_attention_kwargs = cross_attention_kwargs
828
+ self._interrupt = False
829
+
830
+ # 2. Define call parameters
831
+ if prompt is not None and isinstance(prompt, str):
832
+ batch_size = 1
833
+ elif prompt is not None and isinstance(prompt, list):
834
+ batch_size = len(prompt)
835
+ else:
836
+ batch_size = prompt_embeds.shape[0]
837
+
838
+ device = torch.device(f"cuda:{dist.get_rank()}") if dist.is_initialized() else self._execution_device
839
+
840
+ # 3. Encode input prompt
841
+ lora_scale = (
842
+ self.cross_attention_kwargs.get("scale", None)
843
+ if self.cross_attention_kwargs is not None
844
+ else None
845
+ )
846
+
847
+ (
848
+ prompt_embeds,
849
+ negative_prompt_embeds,
850
+ prompt_mask,
851
+ negative_prompt_mask,
852
+ ) = self.encode_prompt(
853
+ prompt,
854
+ device,
855
+ num_videos_per_prompt,
856
+ self.do_classifier_free_guidance,
857
+ negative_prompt,
858
+ prompt_embeds=prompt_embeds,
859
+ attention_mask=attention_mask,
860
+ negative_prompt_embeds=negative_prompt_embeds,
861
+ negative_attention_mask=negative_attention_mask,
862
+ lora_scale=lora_scale,
863
+ clip_skip=self.clip_skip,
864
+ data_type=data_type,
865
+ )
866
+ if self.text_encoder_2 is not None:
867
+ (
868
+ prompt_embeds_2,
869
+ negative_prompt_embeds_2,
870
+ prompt_mask_2,
871
+ negative_prompt_mask_2,
872
+ ) = self.encode_prompt(
873
+ prompt,
874
+ device,
875
+ num_videos_per_prompt,
876
+ self.do_classifier_free_guidance,
877
+ negative_prompt,
878
+ prompt_embeds=None,
879
+ attention_mask=None,
880
+ negative_prompt_embeds=None,
881
+ negative_attention_mask=None,
882
+ lora_scale=lora_scale,
883
+ clip_skip=self.clip_skip,
884
+ text_encoder=self.text_encoder_2,
885
+ data_type=data_type,
886
+ )
887
+ else:
888
+ prompt_embeds_2 = None
889
+ negative_prompt_embeds_2 = None
890
+ prompt_mask_2 = None
891
+ negative_prompt_mask_2 = None
892
+
893
+ # For classifier free guidance, we need to do two forward passes.
894
+ # Here we concatenate the unconditional and text embeddings into a single batch
895
+ # to avoid doing two forward passes
896
+ if self.do_classifier_free_guidance:
897
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
898
+ if prompt_mask is not None:
899
+ prompt_mask = torch.cat([negative_prompt_mask, prompt_mask])
900
+ if prompt_embeds_2 is not None:
901
+ prompt_embeds_2 = torch.cat([negative_prompt_embeds_2, prompt_embeds_2])
902
+ if prompt_mask_2 is not None:
903
+ prompt_mask_2 = torch.cat([negative_prompt_mask_2, prompt_mask_2])
904
+
905
+
906
+ # 4. Prepare timesteps
907
+ extra_set_timesteps_kwargs = self.prepare_extra_func_kwargs(
908
+ self.scheduler.set_timesteps, {"n_tokens": n_tokens}
909
+ )
910
+ timesteps, num_inference_steps = retrieve_timesteps(
911
+ self.scheduler,
912
+ num_inference_steps,
913
+ device,
914
+ timesteps,
915
+ sigmas,
916
+ **extra_set_timesteps_kwargs,
917
+ )
918
+
919
+ if "884" in vae_ver:
920
+ video_length = (video_length - 1) // 4 + 1
921
+ elif "888" in vae_ver:
922
+ video_length = (video_length - 1) // 8 + 1
923
+ else:
924
+ video_length = video_length
925
+
926
+ # 5. Prepare latent variables
927
+ num_channels_latents = self.transformer.config.in_channels
928
+ latents = self.prepare_latents(
929
+ batch_size * num_videos_per_prompt,
930
+ num_channels_latents,
931
+ height,
932
+ width,
933
+ video_length,
934
+ prompt_embeds.dtype,
935
+ device,
936
+ generator,
937
+ latents,
938
+ )
939
+
940
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
941
+ extra_step_kwargs = self.prepare_extra_func_kwargs(
942
+ self.scheduler.step,
943
+ {"generator": generator, "eta": eta},
944
+ )
945
+
946
+ target_dtype = PRECISION_TO_TYPE[self.args.precision]
947
+ autocast_enabled = (
948
+ target_dtype != torch.float32
949
+ ) and not self.args.disable_autocast
950
+ vae_dtype = PRECISION_TO_TYPE[self.args.vae_precision]
951
+ vae_autocast_enabled = (
952
+ vae_dtype != torch.float32
953
+ ) and not self.args.disable_autocast
954
+
955
+ # 7. Denoising loop
956
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
957
+ self._num_timesteps = len(timesteps)
958
+
959
+ # if is_progress_bar:
960
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
961
+ for i, t in enumerate(timesteps):
962
+ if self.interrupt:
963
+ continue
964
+
965
+ # expand the latents if we are doing classifier free guidance
966
+ latent_model_input = (
967
+ torch.cat([latents] * 2)
968
+ if self.do_classifier_free_guidance
969
+ else latents
970
+ )
971
+ latent_model_input = self.scheduler.scale_model_input(
972
+ latent_model_input, t
973
+ )
974
+
975
+ t_expand = t.repeat(latent_model_input.shape[0])
976
+ guidance_expand = (
977
+ torch.tensor(
978
+ [embedded_guidance_scale] * latent_model_input.shape[0],
979
+ dtype=torch.float32,
980
+ device=device,
981
+ ).to(target_dtype)
982
+ * 1000.0
983
+ if embedded_guidance_scale is not None
984
+ else None
985
+ )
986
+
987
+ # predict the noise residual
988
+ with torch.autocast(
989
+ device_type="cuda", dtype=target_dtype, enabled=autocast_enabled
990
+ ):
991
+ noise_pred = self.transformer( # For an input image (129, 192, 336) (1, 256, 256)
992
+ latent_model_input, # [2, 16, 33, 24, 42]
993
+ t_expand, # [2]
994
+ text_states=prompt_embeds, # [2, 256, 4096]
995
+ text_mask=prompt_mask, # [2, 256]
996
+ text_states_2=prompt_embeds_2, # [2, 768]
997
+ freqs_cos=freqs_cis[0], # [seqlen, head_dim]
998
+ freqs_sin=freqs_cis[1], # [seqlen, head_dim]
999
+ guidance=guidance_expand,
1000
+ return_dict=True,
1001
+ )[
1002
+ "x"
1003
+ ]
1004
+
1005
+ # perform guidance
1006
+ if self.do_classifier_free_guidance:
1007
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
1008
+ noise_pred = noise_pred_uncond + self.guidance_scale * (
1009
+ noise_pred_text - noise_pred_uncond
1010
+ )
1011
+
1012
+ if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
1013
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
1014
+ noise_pred = rescale_noise_cfg(
1015
+ noise_pred,
1016
+ noise_pred_text,
1017
+ guidance_rescale=self.guidance_rescale,
1018
+ )
1019
+
1020
+ # compute the previous noisy sample x_t -> x_t-1
1021
+ latents = self.scheduler.step(
1022
+ noise_pred, t, latents, **extra_step_kwargs, return_dict=False
1023
+ )[0]
1024
+
1025
+ if callback_on_step_end is not None:
1026
+ callback_kwargs = {}
1027
+ for k in callback_on_step_end_tensor_inputs:
1028
+ callback_kwargs[k] = locals()[k]
1029
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1030
+
1031
+ latents = callback_outputs.pop("latents", latents)
1032
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1033
+ negative_prompt_embeds = callback_outputs.pop(
1034
+ "negative_prompt_embeds", negative_prompt_embeds
1035
+ )
1036
+
1037
+ # call the callback, if provided
1038
+ if i == len(timesteps) - 1 or (
1039
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
1040
+ ):
1041
+ if progress_bar is not None:
1042
+ progress_bar.update()
1043
+ if callback is not None and i % callback_steps == 0:
1044
+ step_idx = i // getattr(self.scheduler, "order", 1)
1045
+ callback(step_idx, t, latents)
1046
+
1047
+ if not output_type == "latent":
1048
+ expand_temporal_dim = False
1049
+ if len(latents.shape) == 4:
1050
+ if isinstance(self.vae, AutoencoderKLCausal3D):
1051
+ latents = latents.unsqueeze(2)
1052
+ expand_temporal_dim = True
1053
+ elif len(latents.shape) == 5:
1054
+ pass
1055
+ else:
1056
+ raise ValueError(
1057
+ f"Only support latents with shape (b, c, h, w) or (b, c, f, h, w), but got {latents.shape}."
1058
+ )
1059
+
1060
+ if (
1061
+ hasattr(self.vae.config, "shift_factor")
1062
+ and self.vae.config.shift_factor
1063
+ ):
1064
+ latents = (
1065
+ latents / self.vae.config.scaling_factor
1066
+ + self.vae.config.shift_factor
1067
+ )
1068
+ else:
1069
+ latents = latents / self.vae.config.scaling_factor
1070
+
1071
+ with torch.autocast(
1072
+ device_type="cuda", dtype=vae_dtype, enabled=vae_autocast_enabled
1073
+ ):
1074
+ if enable_tiling:
1075
+ self.vae.enable_tiling()
1076
+ image = self.vae.decode(
1077
+ latents, return_dict=False, generator=generator
1078
+ )[0]
1079
+ else:
1080
+ image = self.vae.decode(
1081
+ latents, return_dict=False, generator=generator
1082
+ )[0]
1083
+
1084
+ if expand_temporal_dim or image.shape[2] == 1:
1085
+ image = image.squeeze(2)
1086
+
1087
+ else:
1088
+ image = latents
1089
+
1090
+ image = (image / 2 + 0.5).clamp(0, 1)
1091
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
1092
+ image = image.cpu().float()
1093
+
1094
+ # Offload all models
1095
+ self.maybe_free_model_hooks()
1096
+
1097
+ if not return_dict:
1098
+ return image
1099
+
1100
+ return HunyuanVideoPipelineOutput(videos=image)
hunyuan_model/posemb_layers.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from typing import Union, Tuple, List
3
+
4
+
5
+ def _to_tuple(x, dim=2):
6
+ if isinstance(x, int):
7
+ return (x,) * dim
8
+ elif len(x) == dim:
9
+ return x
10
+ else:
11
+ raise ValueError(f"Expected length {dim} or int, but got {x}")
12
+
13
+
14
+ def get_meshgrid_nd(start, *args, dim=2):
15
+ """
16
+ Get n-D meshgrid with start, stop and num.
17
+
18
+ Args:
19
+ start (int or tuple): If len(args) == 0, start is num; If len(args) == 1, start is start, args[0] is stop,
20
+ step is 1; If len(args) == 2, start is start, args[0] is stop, args[1] is num. For n-dim, start/stop/num
21
+ should be int or n-tuple. If n-tuple is provided, the meshgrid will be stacked following the dim order in
22
+ n-tuples.
23
+ *args: See above.
24
+ dim (int): Dimension of the meshgrid. Defaults to 2.
25
+
26
+ Returns:
27
+ grid (np.ndarray): [dim, ...]
28
+ """
29
+ if len(args) == 0:
30
+ # start is grid_size
31
+ num = _to_tuple(start, dim=dim)
32
+ start = (0,) * dim
33
+ stop = num
34
+ elif len(args) == 1:
35
+ # start is start, args[0] is stop, step is 1
36
+ start = _to_tuple(start, dim=dim)
37
+ stop = _to_tuple(args[0], dim=dim)
38
+ num = [stop[i] - start[i] for i in range(dim)]
39
+ elif len(args) == 2:
40
+ # start is start, args[0] is stop, args[1] is num
41
+ start = _to_tuple(start, dim=dim) # Left-Top eg: 12,0
42
+ stop = _to_tuple(args[0], dim=dim) # Right-Bottom eg: 20,32
43
+ num = _to_tuple(args[1], dim=dim) # Target Size eg: 32,124
44
+ else:
45
+ raise ValueError(f"len(args) should be 0, 1 or 2, but got {len(args)}")
46
+
47
+ # PyTorch implement of np.linspace(start[i], stop[i], num[i], endpoint=False)
48
+ axis_grid = []
49
+ for i in range(dim):
50
+ a, b, n = start[i], stop[i], num[i]
51
+ g = torch.linspace(a, b, n + 1, dtype=torch.float32)[:n]
52
+ axis_grid.append(g)
53
+ grid = torch.meshgrid(*axis_grid, indexing="ij") # dim x [W, H, D]
54
+ grid = torch.stack(grid, dim=0) # [dim, W, H, D]
55
+
56
+ return grid
57
+
58
+
59
+ #################################################################################
60
+ # Rotary Positional Embedding Functions #
61
+ #################################################################################
62
+ # https://github.com/meta-llama/llama/blob/be327c427cc5e89cc1d3ab3d3fec4484df771245/llama/model.py#L80
63
+
64
+
65
+ def reshape_for_broadcast(
66
+ freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
67
+ x: torch.Tensor,
68
+ head_first=False,
69
+ ):
70
+ """
71
+ Reshape frequency tensor for broadcasting it with another tensor.
72
+
73
+ This function reshapes the frequency tensor to have the same shape as the target tensor 'x'
74
+ for the purpose of broadcasting the frequency tensor during element-wise operations.
75
+
76
+ Notes:
77
+ When using FlashMHAModified, head_first should be False.
78
+ When using Attention, head_first should be True.
79
+
80
+ Args:
81
+ freqs_cis (Union[torch.Tensor, Tuple[torch.Tensor]]): Frequency tensor to be reshaped.
82
+ x (torch.Tensor): Target tensor for broadcasting compatibility.
83
+ head_first (bool): head dimension first (except batch dim) or not.
84
+
85
+ Returns:
86
+ torch.Tensor: Reshaped frequency tensor.
87
+
88
+ Raises:
89
+ AssertionError: If the frequency tensor doesn't match the expected shape.
90
+ AssertionError: If the target tensor 'x' doesn't have the expected number of dimensions.
91
+ """
92
+ ndim = x.ndim
93
+ assert 0 <= 1 < ndim
94
+
95
+ if isinstance(freqs_cis, tuple):
96
+ # freqs_cis: (cos, sin) in real space
97
+ if head_first:
98
+ assert freqs_cis[0].shape == (
99
+ x.shape[-2],
100
+ x.shape[-1],
101
+ ), f"freqs_cis shape {freqs_cis[0].shape} does not match x shape {x.shape}"
102
+ shape = [
103
+ d if i == ndim - 2 or i == ndim - 1 else 1
104
+ for i, d in enumerate(x.shape)
105
+ ]
106
+ else:
107
+ assert freqs_cis[0].shape == (
108
+ x.shape[1],
109
+ x.shape[-1],
110
+ ), f"freqs_cis shape {freqs_cis[0].shape} does not match x shape {x.shape}"
111
+ shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
112
+ return freqs_cis[0].view(*shape), freqs_cis[1].view(*shape)
113
+ else:
114
+ # freqs_cis: values in complex space
115
+ if head_first:
116
+ assert freqs_cis.shape == (
117
+ x.shape[-2],
118
+ x.shape[-1],
119
+ ), f"freqs_cis shape {freqs_cis.shape} does not match x shape {x.shape}"
120
+ shape = [
121
+ d if i == ndim - 2 or i == ndim - 1 else 1
122
+ for i, d in enumerate(x.shape)
123
+ ]
124
+ else:
125
+ assert freqs_cis.shape == (
126
+ x.shape[1],
127
+ x.shape[-1],
128
+ ), f"freqs_cis shape {freqs_cis.shape} does not match x shape {x.shape}"
129
+ shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
130
+ return freqs_cis.view(*shape)
131
+
132
+
133
+ def rotate_half(x):
134
+ x_real, x_imag = (
135
+ x.float().reshape(*x.shape[:-1], -1, 2).unbind(-1)
136
+ ) # [B, S, H, D//2]
137
+ return torch.stack([-x_imag, x_real], dim=-1).flatten(3)
138
+
139
+
140
+ def apply_rotary_emb(
141
+ xq: torch.Tensor,
142
+ xk: torch.Tensor,
143
+ freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
144
+ head_first: bool = False,
145
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
146
+ """
147
+ Apply rotary embeddings to input tensors using the given frequency tensor.
148
+
149
+ This function applies rotary embeddings to the given query 'xq' and key 'xk' tensors using the provided
150
+ frequency tensor 'freqs_cis'. The input tensors are reshaped as complex numbers, and the frequency tensor
151
+ is reshaped for broadcasting compatibility. The resulting tensors contain rotary embeddings and are
152
+ returned as real tensors.
153
+
154
+ Args:
155
+ xq (torch.Tensor): Query tensor to apply rotary embeddings. [B, S, H, D]
156
+ xk (torch.Tensor): Key tensor to apply rotary embeddings. [B, S, H, D]
157
+ freqs_cis (torch.Tensor or tuple): Precomputed frequency tensor for complex exponential.
158
+ head_first (bool): head dimension first (except batch dim) or not.
159
+
160
+ Returns:
161
+ Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
162
+
163
+ """
164
+ xk_out = None
165
+ if isinstance(freqs_cis, tuple):
166
+ cos, sin = reshape_for_broadcast(freqs_cis, xq, head_first) # [S, D]
167
+ cos, sin = cos.to(xq.device), sin.to(xq.device)
168
+ # real * cos - imag * sin
169
+ # imag * cos + real * sin
170
+ xq_out = (xq.float() * cos + rotate_half(xq.float()) * sin).type_as(xq)
171
+ xk_out = (xk.float() * cos + rotate_half(xk.float()) * sin).type_as(xk)
172
+ else:
173
+ # view_as_complex will pack [..., D/2, 2](real) to [..., D/2](complex)
174
+ xq_ = torch.view_as_complex(
175
+ xq.float().reshape(*xq.shape[:-1], -1, 2)
176
+ ) # [B, S, H, D//2]
177
+ freqs_cis = reshape_for_broadcast(freqs_cis, xq_, head_first).to(
178
+ xq.device
179
+ ) # [S, D//2] --> [1, S, 1, D//2]
180
+ # (real, imag) * (cos, sin) = (real * cos - imag * sin, imag * cos + real * sin)
181
+ # view_as_real will expand [..., D/2](complex) to [..., D/2, 2](real)
182
+ xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3).type_as(xq)
183
+ xk_ = torch.view_as_complex(
184
+ xk.float().reshape(*xk.shape[:-1], -1, 2)
185
+ ) # [B, S, H, D//2]
186
+ xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3).type_as(xk)
187
+
188
+ return xq_out, xk_out
189
+
190
+
191
+ def get_nd_rotary_pos_embed(
192
+ rope_dim_list,
193
+ start,
194
+ *args,
195
+ theta=10000.0,
196
+ use_real=False,
197
+ theta_rescale_factor: Union[float, List[float]] = 1.0,
198
+ interpolation_factor: Union[float, List[float]] = 1.0,
199
+ ):
200
+ """
201
+ This is a n-d version of precompute_freqs_cis, which is a RoPE for tokens with n-d structure.
202
+
203
+ Args:
204
+ rope_dim_list (list of int): Dimension of each rope. len(rope_dim_list) should equal to n.
205
+ sum(rope_dim_list) should equal to head_dim of attention layer.
206
+ start (int | tuple of int | list of int): If len(args) == 0, start is num; If len(args) == 1, start is start,
207
+ args[0] is stop, step is 1; If len(args) == 2, start is start, args[0] is stop, args[1] is num.
208
+ *args: See above.
209
+ theta (float): Scaling factor for frequency computation. Defaults to 10000.0.
210
+ use_real (bool): If True, return real part and imaginary part separately. Otherwise, return complex numbers.
211
+ Some libraries such as TensorRT does not support complex64 data type. So it is useful to provide a real
212
+ part and an imaginary part separately.
213
+ theta_rescale_factor (float): Rescale factor for theta. Defaults to 1.0.
214
+
215
+ Returns:
216
+ pos_embed (torch.Tensor): [HW, D/2]
217
+ """
218
+
219
+ grid = get_meshgrid_nd(
220
+ start, *args, dim=len(rope_dim_list)
221
+ ) # [3, W, H, D] / [2, W, H]
222
+
223
+ if isinstance(theta_rescale_factor, int) or isinstance(theta_rescale_factor, float):
224
+ theta_rescale_factor = [theta_rescale_factor] * len(rope_dim_list)
225
+ elif isinstance(theta_rescale_factor, list) and len(theta_rescale_factor) == 1:
226
+ theta_rescale_factor = [theta_rescale_factor[0]] * len(rope_dim_list)
227
+ assert len(theta_rescale_factor) == len(
228
+ rope_dim_list
229
+ ), "len(theta_rescale_factor) should equal to len(rope_dim_list)"
230
+
231
+ if isinstance(interpolation_factor, int) or isinstance(interpolation_factor, float):
232
+ interpolation_factor = [interpolation_factor] * len(rope_dim_list)
233
+ elif isinstance(interpolation_factor, list) and len(interpolation_factor) == 1:
234
+ interpolation_factor = [interpolation_factor[0]] * len(rope_dim_list)
235
+ assert len(interpolation_factor) == len(
236
+ rope_dim_list
237
+ ), "len(interpolation_factor) should equal to len(rope_dim_list)"
238
+
239
+ # use 1/ndim of dimensions to encode grid_axis
240
+ embs = []
241
+ for i in range(len(rope_dim_list)):
242
+ emb = get_1d_rotary_pos_embed(
243
+ rope_dim_list[i],
244
+ grid[i].reshape(-1),
245
+ theta,
246
+ use_real=use_real,
247
+ theta_rescale_factor=theta_rescale_factor[i],
248
+ interpolation_factor=interpolation_factor[i],
249
+ ) # 2 x [WHD, rope_dim_list[i]]
250
+ embs.append(emb)
251
+
252
+ if use_real:
253
+ cos = torch.cat([emb[0] for emb in embs], dim=1) # (WHD, D/2)
254
+ sin = torch.cat([emb[1] for emb in embs], dim=1) # (WHD, D/2)
255
+ return cos, sin
256
+ else:
257
+ emb = torch.cat(embs, dim=1) # (WHD, D/2)
258
+ return emb
259
+
260
+
261
+ def get_1d_rotary_pos_embed(
262
+ dim: int,
263
+ pos: Union[torch.FloatTensor, int],
264
+ theta: float = 10000.0,
265
+ use_real: bool = False,
266
+ theta_rescale_factor: float = 1.0,
267
+ interpolation_factor: float = 1.0,
268
+ ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
269
+ """
270
+ Precompute the frequency tensor for complex exponential (cis) with given dimensions.
271
+ (Note: `cis` means `cos + i * sin`, where i is the imaginary unit.)
272
+
273
+ This function calculates a frequency tensor with complex exponential using the given dimension 'dim'
274
+ and the end index 'end'. The 'theta' parameter scales the frequencies.
275
+ The returned tensor contains complex values in complex64 data type.
276
+
277
+ Args:
278
+ dim (int): Dimension of the frequency tensor.
279
+ pos (int or torch.FloatTensor): Position indices for the frequency tensor. [S] or scalar
280
+ theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.
281
+ use_real (bool, optional): If True, return real part and imaginary part separately.
282
+ Otherwise, return complex numbers.
283
+ theta_rescale_factor (float, optional): Rescale factor for theta. Defaults to 1.0.
284
+
285
+ Returns:
286
+ freqs_cis: Precomputed frequency tensor with complex exponential. [S, D/2]
287
+ freqs_cos, freqs_sin: Precomputed frequency tensor with real and imaginary parts separately. [S, D]
288
+ """
289
+ if isinstance(pos, int):
290
+ pos = torch.arange(pos).float()
291
+
292
+ # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
293
+ # has some connection to NTK literature
294
+ if theta_rescale_factor != 1.0:
295
+ theta *= theta_rescale_factor ** (dim / (dim - 2))
296
+
297
+ freqs = 1.0 / (
298
+ theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)
299
+ ) # [D/2]
300
+ # assert interpolation_factor == 1.0, f"interpolation_factor: {interpolation_factor}"
301
+ freqs = torch.outer(pos * interpolation_factor, freqs) # [S, D/2]
302
+ if use_real:
303
+ freqs_cos = freqs.cos().repeat_interleave(2, dim=1) # [S, D]
304
+ freqs_sin = freqs.sin().repeat_interleave(2, dim=1) # [S, D]
305
+ return freqs_cos, freqs_sin
306
+ else:
307
+ freqs_cis = torch.polar(
308
+ torch.ones_like(freqs), freqs
309
+ ) # complex64 # [S, D/2]
310
+ return freqs_cis
hunyuan_model/text_encoder.py ADDED
@@ -0,0 +1,710 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ import json
3
+ import os
4
+ from typing import Optional, Tuple, Union
5
+ from copy import deepcopy
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ from transformers import (
10
+ CLIPTextModel,
11
+ CLIPTokenizer,
12
+ AutoTokenizer,
13
+ AutoModel,
14
+ CLIPConfig,
15
+ LlamaForCausalLM,
16
+ LlamaConfig,
17
+ )
18
+ from transformers.utils import ModelOutput
19
+ from transformers.models.llama import LlamaModel
20
+ from safetensors.torch import load_file
21
+ from accelerate import init_empty_weights
22
+
23
+ import logging
24
+
25
+ logger = logging.getLogger(__name__)
26
+ logging.basicConfig(level=logging.INFO)
27
+
28
+
29
+ CLIP_L_HUGGINGFACE_MODEL_ID = "openai/clip-vit-large-patch14"
30
+ LLAVA_HUGGINGFACE_MODEL_ID = "xtuner/llava-llama-3-8b-v1_1-transformers"
31
+
32
+ CLIP_CONFIG = {
33
+ "_name_or_path": "clip-vit-large-patch14/",
34
+ "architectures": ["CLIPModel"],
35
+ "initializer_factor": 1.0,
36
+ "logit_scale_init_value": 2.6592,
37
+ "model_type": "clip",
38
+ "projection_dim": 768,
39
+ # "text_config": {
40
+ "_name_or_path": "",
41
+ "add_cross_attention": False,
42
+ "architectures": None,
43
+ "attention_dropout": 0.0,
44
+ "bad_words_ids": None,
45
+ "bos_token_id": 0,
46
+ "chunk_size_feed_forward": 0,
47
+ "cross_attention_hidden_size": None,
48
+ "decoder_start_token_id": None,
49
+ "diversity_penalty": 0.0,
50
+ "do_sample": False,
51
+ "dropout": 0.0,
52
+ "early_stopping": False,
53
+ "encoder_no_repeat_ngram_size": 0,
54
+ "eos_token_id": 2,
55
+ "finetuning_task": None,
56
+ "forced_bos_token_id": None,
57
+ "forced_eos_token_id": None,
58
+ "hidden_act": "quick_gelu",
59
+ "hidden_size": 768,
60
+ "id2label": {"0": "LABEL_0", "1": "LABEL_1"},
61
+ "initializer_factor": 1.0,
62
+ "initializer_range": 0.02,
63
+ "intermediate_size": 3072,
64
+ "is_decoder": False,
65
+ "is_encoder_decoder": False,
66
+ "label2id": {"LABEL_0": 0, "LABEL_1": 1},
67
+ "layer_norm_eps": 1e-05,
68
+ "length_penalty": 1.0,
69
+ "max_length": 20,
70
+ "max_position_embeddings": 77,
71
+ "min_length": 0,
72
+ "model_type": "clip_text_model",
73
+ "no_repeat_ngram_size": 0,
74
+ "num_attention_heads": 12,
75
+ "num_beam_groups": 1,
76
+ "num_beams": 1,
77
+ "num_hidden_layers": 12,
78
+ "num_return_sequences": 1,
79
+ "output_attentions": False,
80
+ "output_hidden_states": False,
81
+ "output_scores": False,
82
+ "pad_token_id": 1,
83
+ "prefix": None,
84
+ "problem_type": None,
85
+ "projection_dim": 768,
86
+ "pruned_heads": {},
87
+ "remove_invalid_values": False,
88
+ "repetition_penalty": 1.0,
89
+ "return_dict": True,
90
+ "return_dict_in_generate": False,
91
+ "sep_token_id": None,
92
+ "task_specific_params": None,
93
+ "temperature": 1.0,
94
+ "tie_encoder_decoder": False,
95
+ "tie_word_embeddings": True,
96
+ "tokenizer_class": None,
97
+ "top_k": 50,
98
+ "top_p": 1.0,
99
+ "torch_dtype": None,
100
+ "torchscript": False,
101
+ "transformers_version": "4.16.0.dev0",
102
+ "use_bfloat16": False,
103
+ "vocab_size": 49408,
104
+ # },
105
+ # "text_config_dict": {
106
+ "hidden_size": 768,
107
+ "intermediate_size": 3072,
108
+ "num_attention_heads": 12,
109
+ "num_hidden_layers": 12,
110
+ "projection_dim": 768,
111
+ # },
112
+ # "torch_dtype": "float32",
113
+ # "transformers_version": null
114
+ }
115
+
116
# Default configuration for the LLaMA-architecture LLM text encoder. Used by
# load_llm() to build an empty model skeleton when loading a single-file
# checkpoint (i.e. when no config.json is available next to the weights).
LLAMA_CONFIG = {
    "architectures": ["LlamaForCausalLM"],
    "attention_bias": False,
    "attention_dropout": 0.0,
    "bos_token_id": 128000,
    "eos_token_id": 128001,
    "head_dim": 128,
    "hidden_act": "silu",
    "hidden_size": 4096,
    "initializer_range": 0.02,
    "intermediate_size": 14336,
    "max_position_embeddings": 8192,
    "mlp_bias": False,
    "model_type": "llama",
    "num_attention_heads": 32,
    "num_hidden_layers": 32,
    "num_key_value_heads": 8,
    "pretraining_tp": 1,
    "rms_norm_eps": 1e-05,
    "rope_scaling": None,
    "rope_theta": 500000.0,
    "tie_word_embeddings": False,
    "torch_dtype": "float16",
    "transformers_version": "4.46.3",
    "use_cache": True,
    "vocab_size": 128320,
}
143
+
144
# When using decoder-only models, we must provide a prompt template to instruct the text encoder
# on how to generate the text.
# --------------------------------------------------------------------
# Image template: a system instruction followed by the user prompt, which is
# substituted into the `{}` placeholder.
PROMPT_TEMPLATE_ENCODE = (
    "<|start_header_id|>system<|end_header_id|>\n\nDescribe the image by detailing the color, shape, size, texture, "
    "quantity, text, spatial relationships of the objects and background:<|eot_id|>"
    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
)
# Video template: a longer system instruction that also covers temporal aspects.
PROMPT_TEMPLATE_ENCODE_VIDEO = (
    "<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: "
    "1. The main content and theme of the video."
    "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
    "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
    "4. background environment, light, style and atmosphere."
    "5. camera angles, movements, and transitions used in the video:<|eot_id|>"
    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
)

NEGATIVE_PROMPT = "Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion"

# `crop_start` is the number of leading (instruction) tokens that TextEncoder.encode
# strips from the encoder output so that only the user-prompt tokens remain.
PROMPT_TEMPLATE = {
    "dit-llm-encode": {
        "template": PROMPT_TEMPLATE_ENCODE,
        "crop_start": 36,
    },
    "dit-llm-encode-video": {
        "template": PROMPT_TEMPLATE_ENCODE_VIDEO,
        "crop_start": 95,
    },
}
174
+
175
+
176
def use_default(value, default):
    """Return *value* unless it is None, in which case return *default*."""
    if value is None:
        return default
    return value
178
+
179
+
180
def load_clip_l(text_encoder_path: str, dtype: Optional[Union[str, torch.dtype]] = None):
    """Load the CLIP-L text encoder from a directory or a single weights file.

    Args:
        text_encoder_path: A HuggingFace-style model directory (configs included)
            or a path to a single safetensors weights file.
        dtype: Optional dtype used when instantiating the model.
    """
    if not os.path.isdir(text_encoder_path):
        # Single-file checkpoint: build an empty model from the known CLIP config,
        # then load the weights directly into it.
        clip_config = CLIPConfig(**CLIP_CONFIG)
        with init_empty_weights():
            model = CLIPTextModel._from_config(clip_config, torch_dtype=dtype)
        weights = load_file(text_encoder_path)
        model.load_state_dict(weights, strict=True, assign=True)
        return model

    # Directory checkpoint: the configuration files live next to the weights.
    return CLIPTextModel.from_pretrained(text_encoder_path, torch_dtype=dtype)
197
+
198
+
199
def load_clip_l_tokenizer(tokenizer_path: str):
    """Return the CLIP-L tokenizer from a local directory, or fall back to the hub."""
    if not os.path.isdir(tokenizer_path):
        # No local directory: fetch the reference tokenizer from Hugging Face.
        logger.info(f"Loading tokenizer from Hugging Face: {CLIP_L_HUGGINGFACE_MODEL_ID}")
        return CLIPTokenizer.from_pretrained(CLIP_L_HUGGINGFACE_MODEL_ID, max_length=77)
    return CLIPTokenizer.from_pretrained(tokenizer_path, max_length=77)
208
+
209
+
210
def load_llm(text_encoder_path: str, dtype: Optional[Union[str, torch.dtype]] = None):
    """Load the LLM text encoder from a directory or a single safetensors file.

    Single-file checkpoints are instantiated from LLAMA_CONFIG; an embedded
    "tokenizer" entry (as written by ComfyUI) is dropped before loading.
    """
    if os.path.isdir(text_encoder_path):
        # Directory checkpoint: configs are read from the directory itself.
        return AutoModel.from_pretrained(text_encoder_path, low_cpu_mem_usage=True, torch_dtype=dtype)

    # Single-file checkpoint: build an empty model skeleton and fill in the weights.
    llama_config = LlamaConfig(**LLAMA_CONFIG)
    with init_empty_weights():
        model = LlamaForCausalLM._from_config(llama_config, torch_dtype=dtype)

    weights = load_file(text_encoder_path)

    # support weights from ComfyUI
    weights.pop("tokenizer", None)

    model.load_state_dict(weights, strict=True, assign=True)
    return model
229
+
230
+
231
def load_llm_tokenizer(tokenizer_path: str, padding_side="right"):
    """Load the LLM tokenizer from a local directory or from Hugging Face.

    Args:
        tokenizer_path: Local tokenizer directory; any other value falls back to
            the reference Hugging Face model.
        padding_side: Which side the tokenizer pads on ("right" or "left").
    """
    if os.path.isdir(tokenizer_path):
        # Fix: `padding_side` was previously ignored for local directories,
        # diverging from the Hugging Face branch below.
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, padding_side=padding_side)
    else:
        # load from Hugging Face
        logger.info(f"Loading tokenizer from Hugging Face: {LLAVA_HUGGINGFACE_MODEL_ID}")
        tokenizer = AutoTokenizer.from_pretrained(LLAVA_HUGGINGFACE_MODEL_ID, padding_side=padding_side)

    return tokenizer
240
+
241
+
242
def load_text_encoder(
    text_encoder_type: str,
    text_encoder_path: str,
    text_encoder_dtype: Optional[Union[str, torch.dtype]] = None,
):
    """Load a text encoder of the given type ("clipL" or "llm"), freeze it,
    and return ``(model, text_encoder_path)``.

    The loaded model gets a ``final_layer_norm`` attribute aliased onto it so
    callers can apply the final norm uniformly regardless of encoder type.
    """
    logger.info(f"Loading text encoder model ({text_encoder_type}) from: {text_encoder_path}")

    # reduce peak memory usage by specifying the dtype of the model
    dtype = text_encoder_dtype
    if text_encoder_type == "clipL":
        text_encoder = load_clip_l(text_encoder_path, dtype=dtype)
        # CLIP keeps its final norm under text_model.
        text_encoder.final_layer_norm = text_encoder.text_model.final_layer_norm
    elif text_encoder_type == "llm":
        text_encoder = load_llm(text_encoder_path, dtype=dtype)
        # The norm's attribute path differs depending on which load path was taken.
        if hasattr(text_encoder, "norm"):
            text_encoder.final_layer_norm = text_encoder.norm  # by from_pretrained
        else:
            text_encoder.final_layer_norm = text_encoder.model.norm  # by _from_config
    else:
        raise ValueError(f"Unsupported text encoder type: {text_encoder_type}")
    # from_pretrained will ensure that the model is in eval mode.

    if dtype is not None:
        text_encoder = text_encoder.to(dtype=dtype)

    # The encoder is inference-only here; freeze all parameters.
    text_encoder.requires_grad_(False)

    logger.info(f"Text encoder to dtype: {text_encoder.dtype}")
    return text_encoder, text_encoder_path
271
+
272
+
273
def load_tokenizer(tokenizer_type, tokenizer_path=None, padding_side="right"):
    """Instantiate the tokenizer for *tokenizer_type* and return ``(tokenizer, path)``."""
    logger.info(f"Loading tokenizer ({tokenizer_type}) from: {tokenizer_path}")

    if tokenizer_type == "clipL":
        return load_clip_l_tokenizer(tokenizer_path), tokenizer_path
    if tokenizer_type == "llm":
        return load_llm_tokenizer(tokenizer_path, padding_side=padding_side), tokenizer_path
    raise ValueError(f"Unsupported tokenizer type: {tokenizer_type}")
284
+
285
+
286
@dataclass
class TextEncoderModelOutput(ModelOutput):
    """
    Base class for model's outputs that also contains a pooling of the last hidden states.

    Args:
        hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``.
        hidden_states_list (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        text_outputs (`list`, *optional*, returned when `return_texts=True` is passed):
            List of decoded texts.
    """

    # Hidden states of the selected output layer (possibly cropped of template tokens).
    hidden_state: torch.FloatTensor = None
    # Attention mask aligned with hidden_state, or None when masks are disabled.
    attention_mask: Optional[torch.LongTensor] = None
    hidden_states_list: Optional[Tuple[torch.FloatTensor, ...]] = None
    text_outputs: Optional[list] = None
308
+
309
+
310
class TextEncoder(nn.Module):
    """Wrapper bundling a text encoder (CLIP-L or decoder-only LLM) with its tokenizer.

    Supports optional prompt templating — an instruction prefix whose hidden states
    are cropped off after encoding (see ``PROMPT_TEMPLATE``) — and selecting an
    intermediate hidden layer as the output via ``hidden_state_skip_layer``.
    """

    def __init__(
        self,
        text_encoder_type: str,
        max_length: int,
        text_encoder_dtype: Optional[Union[str, torch.dtype]] = None,
        text_encoder_path: Optional[str] = None,
        tokenizer_type: Optional[str] = None,
        tokenizer_path: Optional[str] = None,
        output_key: Optional[str] = None,
        use_attention_mask: bool = True,
        input_max_length: Optional[int] = None,
        prompt_template: Optional[dict] = None,
        prompt_template_video: Optional[dict] = None,
        hidden_state_skip_layer: Optional[int] = None,
        apply_final_norm: bool = False,
        reproduce: bool = False,
    ):
        super().__init__()
        self.text_encoder_type = text_encoder_type
        self.max_length = max_length
        # self.precision = text_encoder_precision
        self.model_path = text_encoder_path
        # Tokenizer type/path default to the encoder's type/path when not given.
        self.tokenizer_type = tokenizer_type if tokenizer_type is not None else text_encoder_type
        self.tokenizer_path = tokenizer_path if tokenizer_path is not None else text_encoder_path
        self.use_attention_mask = use_attention_mask
        if prompt_template_video is not None:
            assert use_attention_mask is True, "Attention mask is True required when training videos."
        self.input_max_length = input_max_length if input_max_length is not None else max_length
        self.prompt_template = prompt_template
        self.prompt_template_video = prompt_template_video
        self.hidden_state_skip_layer = hidden_state_skip_layer
        self.apply_final_norm = apply_final_norm
        self.reproduce = reproduce

        # Validate the image prompt template: a dict with a "template" string that
        # contains a `{}` placeholder for the user text.
        self.use_template = self.prompt_template is not None
        if self.use_template:
            assert (
                isinstance(self.prompt_template, dict) and "template" in self.prompt_template
            ), f"`prompt_template` must be a dictionary with a key 'template', got {self.prompt_template}"
            assert "{}" in str(self.prompt_template["template"]), (
                "`prompt_template['template']` must contain a placeholder `{}` for the input text, "
                f"got {self.prompt_template['template']}"
            )

        # Validate the video prompt template the same way. (A redundant inner
        # `is not None` check was removed; use_video_template already implies it.)
        self.use_video_template = self.prompt_template_video is not None
        if self.use_video_template:
            assert (
                isinstance(self.prompt_template_video, dict) and "template" in self.prompt_template_video
            ), f"`prompt_template_video` must be a dictionary with a key 'template', got {self.prompt_template_video}"
            assert "{}" in str(self.prompt_template_video["template"]), (
                "`prompt_template_video['template']` must contain a placeholder `{}` for the input text, "
                f"got {self.prompt_template_video['template']}"
            )

        # Which field of the HF model output to use as the encoder output.
        if "t5" in text_encoder_type:
            self.output_key = output_key or "last_hidden_state"
        elif "clip" in text_encoder_type:
            self.output_key = output_key or "pooler_output"
        elif "llm" in text_encoder_type or "glm" in text_encoder_type:
            self.output_key = output_key or "last_hidden_state"
        else:
            raise ValueError(f"Unsupported text encoder type: {text_encoder_type}")

        self.model, self.model_path = load_text_encoder(
            text_encoder_type=self.text_encoder_type, text_encoder_path=self.model_path, text_encoder_dtype=text_encoder_dtype
        )
        self.dtype = self.model.dtype

        self.tokenizer, self.tokenizer_path = load_tokenizer(
            tokenizer_type=self.tokenizer_type, tokenizer_path=self.tokenizer_path, padding_side="right"
        )

    def __repr__(self):
        # Fix: previously referenced self.precision, which is never assigned
        # (its assignment is commented out in __init__), so repr() raised
        # AttributeError. Report the actual dtype instead.
        return f"{self.text_encoder_type} ({self.dtype} - {self.model_path})"

    @property
    def device(self):
        return self.model.device

    @staticmethod
    def apply_text_to_template(text, template, prevent_empty_text=True):
        """
        Apply text to template.

        Args:
            text (str): Input text.
            template (str or list): Template string or list of chat conversation.
            prevent_empty_text (bool): If True, we will prevent the user text from being empty
                by adding a space. Defaults to True.
        """
        if isinstance(template, str):
            # Will send string to tokenizer. Used for llm
            return template.format(text)
        else:
            raise TypeError(f"Unsupported template type: {type(template)}")

    def text2tokens(self, text, data_type="image"):
        """
        Tokenize the input text, applying the image/video prompt template when configured.

        Args:
            text (str or list): Input text.
            data_type (str): "image" or "video"; selects which template to apply.
        """
        tokenize_input_type = "str"
        if self.use_template:
            if data_type == "image":
                prompt_template = self.prompt_template["template"]
            elif data_type == "video":
                prompt_template = self.prompt_template_video["template"]
            else:
                raise ValueError(f"Unsupported data type: {data_type}")
            if isinstance(text, (list, tuple)):
                text = [self.apply_text_to_template(one_text, prompt_template) for one_text in text]
                if isinstance(text[0], list):
                    tokenize_input_type = "list"
            elif isinstance(text, str):
                text = self.apply_text_to_template(text, prompt_template)
                if isinstance(text, list):
                    tokenize_input_type = "list"
            else:
                raise TypeError(f"Unsupported text type: {type(text)}")

        kwargs = dict(
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )
        if tokenize_input_type == "str":
            return self.tokenizer(
                text,
                return_length=False,
                return_overflowing_tokens=False,
                return_attention_mask=True,
                **kwargs,
            )
        elif tokenize_input_type == "list":
            # Chat-style templates are tokenized through the tokenizer's chat template.
            return self.tokenizer.apply_chat_template(
                text,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                **kwargs,
            )
        else:
            raise ValueError(f"Unsupported tokenize_input_type: {tokenize_input_type}")

    def encode(
        self,
        batch_encoding,
        use_attention_mask=None,
        output_hidden_states=False,
        do_sample=None,
        hidden_state_skip_layer=None,
        return_texts=False,
        data_type="image",
        device=None,
    ):
        """
        Args:
            batch_encoding (dict): Batch encoding from tokenizer.
            use_attention_mask (bool): Whether to use attention mask. If None, use self.use_attention_mask.
                Defaults to None.
            output_hidden_states (bool): Whether to output hidden states. If False, return the value of
                self.output_key. If True, return the entire output. If set self.hidden_state_skip_layer,
                output_hidden_states will be set True. Defaults to False.
            do_sample (bool): Whether to sample from the model. Used for Decoder-Only LLMs. Defaults to None.
                When self.reproduce is False, do_sample is set to True by default.
            hidden_state_skip_layer (int): Number of hidden states to hidden_state_skip_layer. 0 means the last layer.
                If None, self.output_key will be used. Defaults to None.
            return_texts (bool): Whether to return the decoded texts. Defaults to False.
        """
        device = self.model.device if device is None else device
        use_attention_mask = use_default(use_attention_mask, self.use_attention_mask)
        hidden_state_skip_layer = use_default(hidden_state_skip_layer, self.hidden_state_skip_layer)
        do_sample = use_default(do_sample, not self.reproduce)
        attention_mask = batch_encoding["attention_mask"].to(device) if use_attention_mask else None
        outputs = self.model(
            input_ids=batch_encoding["input_ids"].to(device),
            attention_mask=attention_mask,
            output_hidden_states=output_hidden_states or hidden_state_skip_layer is not None,
        )
        if hidden_state_skip_layer is not None:
            last_hidden_state = outputs.hidden_states[-(hidden_state_skip_layer + 1)]
            # Real last hidden state already has layer norm applied. So here we only apply it
            # for intermediate layers.
            if hidden_state_skip_layer > 0 and self.apply_final_norm:
                last_hidden_state = self.model.final_layer_norm(last_hidden_state)
        else:
            last_hidden_state = outputs[self.output_key]

        # Remove hidden states of instruction tokens, only keep prompt tokens.
        if self.use_template:
            if data_type == "image":
                crop_start = self.prompt_template.get("crop_start", -1)
            elif data_type == "video":
                crop_start = self.prompt_template_video.get("crop_start", -1)
            else:
                raise ValueError(f"Unsupported data type: {data_type}")
            if crop_start > 0:
                last_hidden_state = last_hidden_state[:, crop_start:]
                attention_mask = attention_mask[:, crop_start:] if use_attention_mask else None

        if output_hidden_states:
            return TextEncoderModelOutput(last_hidden_state, attention_mask, outputs.hidden_states)
        return TextEncoderModelOutput(last_hidden_state, attention_mask)

    def forward(
        self,
        text,
        use_attention_mask=None,
        output_hidden_states=False,
        do_sample=False,
        hidden_state_skip_layer=None,
        return_texts=False,
    ):
        """Tokenize *text* (image template) and encode it in one step."""
        batch_encoding = self.text2tokens(text)
        return self.encode(
            batch_encoding,
            use_attention_mask=use_attention_mask,
            output_hidden_states=output_hidden_states,
            do_sample=do_sample,
            hidden_state_skip_layer=hidden_state_skip_layer,
            return_texts=return_texts,
        )
537
+
538
+
539
+ # region HunyanVideo architecture
540
+
541
+
542
def load_text_encoder_1(
    text_encoder_dir: str, device: torch.device, fp8_llm: bool, dtype: Optional[Union[str, torch.dtype]] = None
) -> TextEncoder:
    """Build the LLM text encoder (text encoder 1) with the HunyuanVideo settings.

    Args:
        text_encoder_dir: Model directory or single weights file.
        device: Target device.
        fp8_llm: If True, cast weights to float8_e4m3fn and patch norm/embedding
            layers so activations stay numerically stable.
        dtype: Model dtype; defaults to float16.
    """
    text_encoder_dtype = dtype or torch.float16
    text_encoder_type = "llm"
    text_len = 256
    hidden_state_skip_layer = 2
    apply_final_norm = False
    reproduce = False

    prompt_template = "dit-llm-encode"
    prompt_template = PROMPT_TEMPLATE[prompt_template]
    prompt_template_video = "dit-llm-encode-video"
    prompt_template_video = PROMPT_TEMPLATE[prompt_template_video]

    # Tokenizer budget = user prompt length + template instruction tokens.
    crop_start = prompt_template_video["crop_start"]  # .get("crop_start", 0)
    max_length = text_len + crop_start

    text_encoder_1 = TextEncoder(
        text_encoder_type=text_encoder_type,
        max_length=max_length,
        text_encoder_dtype=text_encoder_dtype,
        text_encoder_path=text_encoder_dir,
        tokenizer_type=text_encoder_type,
        prompt_template=prompt_template,
        prompt_template_video=prompt_template_video,
        hidden_state_skip_layer=hidden_state_skip_layer,
        apply_final_norm=apply_final_norm,
        reproduce=reproduce,
    )
    text_encoder_1.eval()

    if fp8_llm:
        org_dtype = text_encoder_1.dtype
        logger.info(f"Moving and casting text encoder to {device} and torch.float8_e4m3fn")
        text_encoder_1.to(device=device, dtype=torch.float8_e4m3fn)

        # prepare LLM for fp8
        def prepare_fp8(llama_model: LlamaModel, target_dtype):
            def forward_hook(module):
                # Replacement RMSNorm forward: compute in fp32 for stability,
                # then cast back to the activation dtype.
                def forward(hidden_states):
                    input_dtype = hidden_states.dtype
                    hidden_states = hidden_states.to(torch.float32)
                    variance = hidden_states.pow(2).mean(-1, keepdim=True)
                    hidden_states = hidden_states * torch.rsqrt(variance + module.variance_epsilon)
                    return module.weight.to(input_dtype) * hidden_states.to(input_dtype)

                return forward

            for module in llama_model.modules():
                if module.__class__.__name__ in ["Embedding"]:
                    # Keep embeddings at the original dtype (fp8 embeddings are not supported).
                    # print("set", module.__class__.__name__, "to", target_dtype)
                    module.to(target_dtype)
                if module.__class__.__name__ in ["LlamaRMSNorm"]:
                    # print("set", module.__class__.__name__, "hooks")
                    module.forward = forward_hook(module)

        prepare_fp8(text_encoder_1.model, org_dtype)
    else:
        text_encoder_1.to(device=device)

    return text_encoder_1
604
+
605
+
606
def load_text_encoder_2(
    text_encoder_dir: str, device: torch.device, dtype: Optional[Union[str, torch.dtype]] = None
) -> TextEncoder:
    """Build the CLIP-L text encoder (text encoder 2) on *device*.

    Args:
        text_encoder_dir: Model directory or single weights file.
        device: Target device.
        dtype: Model dtype; defaults to float16.
    """
    encoder = TextEncoder(
        text_encoder_type="clipL",
        max_length=77,  # fixed CLIP context length
        text_encoder_dtype=dtype or torch.float16,
        text_encoder_path=text_encoder_dir,
        tokenizer_type="clipL",
        reproduce=False,
    )
    encoder.eval()
    encoder.to(device=device)
    return encoder
628
+
629
+
630
+ # endregion
631
+
632
+
633
if __name__ == "__main__":
    # Smoke test: load the same encoder type from two checkpoint paths and verify
    # that tokenization and encoded outputs match between them.
    import argparse
    from utils.model_utils import str_to_dtype

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    parser = argparse.ArgumentParser()
    parser.add_argument("type", type=str, help="Text Encoder type")
    parser.add_argument("path1", type=str, help="Text Encoder directory or file 1")
    parser.add_argument("path2", type=str, help="Text Encoder directory or file 2")
    parser.add_argument("--dtype", type=str, default=None, help="Data type for Text Encoder")
    args = parser.parse_args()

    dtype = str_to_dtype(args.dtype) if args.dtype is not None else torch.float16

    # NOTE: the triple-quoted block below is an earlier, lower-level comparison
    # kept as dead code; it is never executed.
    """
    if args.type == "clipL":
        text_encoder_1st = load_clip_l(args.path1, dtype=dtype)
        tokenizer_1st = load_clip_l_tokenizer(args.path1)
        text_encoder_2nd = load_clip_l(args.path2, dtype=dtype)
        tokenizer_2nd = load_clip_l_tokenizer(args.path2)
    elif args.type == "llm":
        text_encoder_1st = load_llm(args.path1, dtype=dtype)
        tokenizer_1st = load_llm_tokenizer(args.path1)
        text_encoder_2nd = load_llm(args.path2, dtype=dtype)
        tokenizer_2nd = load_llm_tokenizer(args.path2)

    print(f"1st Text Encoder dtype: {text_encoder_1st.dtype}")
    print(f"2nd Text Encoder dtype: {text_encoder_2nd.dtype}")

    text_encoder_1st.to(device=device)
    text_encoder_2nd.to(device=device)

    test_text = "A cat sitting on a table"
    token_ids_1st = tokenizer_1st(test_text, return_tensors="pt")["input_ids"]
    token_ids_2nd = tokenizer_2nd(test_text, return_tensors="pt")["input_ids"]
    assert torch.allclose(token_ids_1st, token_ids_2nd)
    print(f"Token IDs are the same: {token_ids_1st}")

    with torch.no_grad():
        text_encoder_1st_output = text_encoder_1st(token_ids_1st.to(device), output_hidden_states=True)
        text_encoder_2nd_output = text_encoder_2nd(token_ids_2nd.to(device), output_hidden_states=True)
    print(f"1st Text Encoder output keys: {text_encoder_1st_output.keys()}")
    print(f"2nd Text Encoder output keys: {text_encoder_2nd_output.keys()}")
    for key in text_encoder_1st_output:
        print(f"Checking output: {key}")
        assert key in text_encoder_2nd_output, f"Key {key} not in 2nd Text Encoder output"
        assert torch.allclose(text_encoder_1st_output[key], text_encoder_2nd_output[key])
        print(f"Outputs are the same: {key}")
    print("All outputs are the same.")
    """

    if args.type == "clipL":
        text_encoder_1st = load_text_encoder_2(args.path1, device, dtype)
        text_encoder_2nd = load_text_encoder_2(args.path2, device, dtype)
    elif args.type == "llm":
        text_encoder_1st = load_text_encoder_1(args.path1, device, False, dtype)
        text_encoder_2nd = load_text_encoder_1(args.path2, device, False, dtype)
    print(f"1st Text Encoder dtype: {text_encoder_1st.dtype}")
    print(f"2nd Text Encoder dtype: {text_encoder_2nd.dtype}")

    prompt = "A cat sitting on a table"
    data_type = "video"  # video only, image is not supported
    text_inputs_1st = text_encoder_1st.text2tokens(prompt, data_type=data_type)
    text_inputs_2nd = text_encoder_2nd.text2tokens(prompt, data_type=data_type)
    print(text_inputs_1st)
    assert torch.allclose(text_inputs_1st["input_ids"], text_inputs_2nd["input_ids"])

    with torch.no_grad():
        prompt_outputs_1st = text_encoder_1st.encode(text_inputs_1st, data_type=data_type)
        prompt_outputs_2nd = text_encoder_2nd.encode(text_inputs_1st, data_type=data_type)

    # prompt_outputs.hidden_state, prompt_outputs.attention_mask
    assert torch.allclose(prompt_outputs_1st.hidden_state, prompt_outputs_2nd.hidden_state)
    print("Hidden states are the same.")
    assert torch.allclose(prompt_outputs_1st.attention_mask, prompt_outputs_2nd.attention_mask)
    print("Attention masks are the same.")
    print("All outputs are the same.")
hunyuan_model/token_refiner.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+
3
+ from einops import rearrange
4
+ import torch
5
+ import torch.nn as nn
6
+ from torch.utils.checkpoint import checkpoint
7
+
8
+ from .activation_layers import get_activation_layer
9
+ from .attention import attention
10
+ from .norm_layers import get_norm_layer
11
+ from .embed_layers import TimestepEmbedder, TextProjection
12
+ from .mlp_layers import MLP
13
+ from .modulate_layers import modulate, apply_gate
14
+
15
+
16
class IndividualTokenRefinerBlock(nn.Module):
    """One refiner block: self-attention + MLP, with residual branches gated by an
    adaLN-style modulation derived from the conditioning vector ``c``."""

    def __init__(
        self,
        hidden_size,
        heads_num,
        mlp_width_ratio: float = 4.0,
        mlp_drop_rate: float = 0.0,
        act_type: str = "silu",
        qk_norm: bool = False,
        qk_norm_type: str = "layer",
        qkv_bias: bool = True,
        dtype: Optional[torch.dtype] = None,
        device: Optional[torch.device] = None,
    ):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.heads_num = heads_num
        head_dim = hidden_size // heads_num
        mlp_hidden_dim = int(hidden_size * mlp_width_ratio)

        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs)
        # Fused QKV projection; split into heads in _forward.
        self.self_attn_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs)
        qk_norm_layer = get_norm_layer(qk_norm_type)
        self.self_attn_q_norm = (
            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
        )
        self.self_attn_k_norm = (
            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
        )
        self.self_attn_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)

        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs)
        act_layer = get_activation_layer(act_type)
        self.mlp = MLP(
            in_channels=hidden_size,
            hidden_channels=mlp_hidden_dim,
            act_layer=act_layer,
            drop=mlp_drop_rate,
            **factory_kwargs,
        )

        # Produces (gate_msa, gate_mlp) from the conditioning vector.
        self.adaLN_modulation = nn.Sequential(
            act_layer(),
            nn.Linear(hidden_size, 2 * hidden_size, bias=True, **factory_kwargs),
        )
        # Zero-initialize the modulation
        nn.init.zeros_(self.adaLN_modulation[1].weight)
        nn.init.zeros_(self.adaLN_modulation[1].bias)

        self.gradient_checkpointing = False

    def enable_gradient_checkpointing(self):
        self.gradient_checkpointing = True

    def disable_gradient_checkpointing(self):
        self.gradient_checkpointing = False

    def _forward(
        self,
        x: torch.Tensor,
        c: torch.Tensor,  # timestep_aware_representations + context_aware_representations
        attn_mask: torch.Tensor = None,
    ):
        gate_msa, gate_mlp = self.adaLN_modulation(c).chunk(2, dim=1)

        norm_x = self.norm1(x)
        qkv = self.self_attn_qkv(norm_x)
        q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
        # Apply QK-Norm if needed
        q = self.self_attn_q_norm(q).to(v)
        k = self.self_attn_k_norm(k).to(v)

        # Self-Attention
        attn = attention(q, k, v, mode="torch", attn_mask=attn_mask)

        # Gated residual connections for attention and MLP branches.
        x = x + apply_gate(self.self_attn_proj(attn), gate_msa)

        # FFN Layer
        x = x + apply_gate(self.mlp(self.norm2(x)), gate_mlp)

        return x

    def forward(self, *args, **kwargs):
        # Route through torch checkpointing only while training with it enabled.
        if self.training and self.gradient_checkpointing:
            return checkpoint(self._forward, *args, use_reentrant=False, **kwargs)
        else:
            return self._forward(*args, **kwargs)
103
+
104
+
105
class IndividualTokenRefiner(nn.Module):
    """Stack of ``depth`` IndividualTokenRefinerBlock modules sharing one
    self-attention mask built from the padding mask."""

    def __init__(
        self,
        hidden_size,
        heads_num,
        depth,
        mlp_width_ratio: float = 4.0,
        mlp_drop_rate: float = 0.0,
        act_type: str = "silu",
        qk_norm: bool = False,
        qk_norm_type: str = "layer",
        qkv_bias: bool = True,
        dtype: Optional[torch.dtype] = None,
        device: Optional[torch.device] = None,
    ):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.blocks = nn.ModuleList(
            [
                IndividualTokenRefinerBlock(
                    hidden_size=hidden_size,
                    heads_num=heads_num,
                    mlp_width_ratio=mlp_width_ratio,
                    mlp_drop_rate=mlp_drop_rate,
                    act_type=act_type,
                    qk_norm=qk_norm,
                    qk_norm_type=qk_norm_type,
                    qkv_bias=qkv_bias,
                    **factory_kwargs,
                )
                for _ in range(depth)
            ]
        )

    def enable_gradient_checkpointing(self):
        for block in self.blocks:
            block.enable_gradient_checkpointing()

    def disable_gradient_checkpointing(self):
        for block in self.blocks:
            block.disable_gradient_checkpointing()

    def forward(
        self,
        x: torch.Tensor,
        c: torch.LongTensor,
        mask: Optional[torch.Tensor] = None,
    ):
        self_attn_mask = None
        if mask is not None:
            batch_size = mask.shape[0]
            seq_len = mask.shape[1]
            mask = mask.to(x.device)
            # batch_size x 1 x seq_len x seq_len
            self_attn_mask_1 = mask.view(batch_size, 1, 1, seq_len).repeat(1, 1, seq_len, 1)
            # batch_size x 1 x seq_len x seq_len
            self_attn_mask_2 = self_attn_mask_1.transpose(2, 3)
            # batch_size x 1 x seq_len x seq_len, 1 for broadcasting of heads_num
            # A pair (i, j) may attend only when both tokens are unmasked.
            self_attn_mask = (self_attn_mask_1 & self_attn_mask_2).bool()
            # avoids self-attention weight being NaN for padding tokens
            self_attn_mask[:, :, :, 0] = True

        for block in self.blocks:
            x = block(x, c, self_attn_mask)
        return x
170
+
171
+
172
class SingleTokenRefiner(nn.Module):
    """
    A single token refiner block for llm text embedding refine.

    Embeds input tokens, builds a conditioning vector from the timestep plus a
    (mask-weighted) mean of the context, and runs the IndividualTokenRefiner stack.
    """

    def __init__(
        self,
        in_channels,
        hidden_size,
        heads_num,
        depth,
        mlp_width_ratio: float = 4.0,
        mlp_drop_rate: float = 0.0,
        act_type: str = "silu",
        qk_norm: bool = False,
        qk_norm_type: str = "layer",
        qkv_bias: bool = True,
        attn_mode: str = "torch",
        dtype: Optional[torch.dtype] = None,
        device: Optional[torch.device] = None,
    ):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.attn_mode = attn_mode
        assert self.attn_mode == "torch", "Only support 'torch' mode for token refiner."

        self.input_embedder = nn.Linear(in_channels, hidden_size, bias=True, **factory_kwargs)

        act_layer = get_activation_layer(act_type)
        # Build timestep embedding layer
        self.t_embedder = TimestepEmbedder(hidden_size, act_layer, **factory_kwargs)
        # Build context embedding layer
        self.c_embedder = TextProjection(in_channels, hidden_size, act_layer, **factory_kwargs)

        self.individual_token_refiner = IndividualTokenRefiner(
            hidden_size=hidden_size,
            heads_num=heads_num,
            depth=depth,
            mlp_width_ratio=mlp_width_ratio,
            mlp_drop_rate=mlp_drop_rate,
            act_type=act_type,
            qk_norm=qk_norm,
            qk_norm_type=qk_norm_type,
            qkv_bias=qkv_bias,
            **factory_kwargs,
        )

    def enable_gradient_checkpointing(self):
        self.individual_token_refiner.enable_gradient_checkpointing()

    def disable_gradient_checkpointing(self):
        self.individual_token_refiner.disable_gradient_checkpointing()

    def forward(
        self,
        x: torch.Tensor,
        t: torch.LongTensor,
        mask: Optional[torch.LongTensor] = None,
    ):
        timestep_aware_representations = self.t_embedder(t)

        # Context summary: plain mean, or mask-weighted mean when padding exists.
        if mask is None:
            context_aware_representations = x.mean(dim=1)
        else:
            mask_float = mask.float().unsqueeze(-1)  # [b, s1, 1]
            context_aware_representations = (x * mask_float).sum(dim=1) / mask_float.sum(dim=1)
        context_aware_representations = self.c_embedder(context_aware_representations)
        c = timestep_aware_representations + context_aware_representations

        x = self.input_embedder(x)

        x = self.individual_token_refiner(x, c, mask)

        return x
hunyuan_model/vae.py ADDED
@@ -0,0 +1,446 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ import json
3
+ from typing import Optional, Tuple, Union
4
+ from pathlib import Path
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn as nn
9
+
10
+ from diffusers.utils import BaseOutput, is_torch_version
11
+ from diffusers.utils.torch_utils import randn_tensor
12
+ from diffusers.models.attention_processor import SpatialNorm
13
+ from modules.unet_causal_3d_blocks import CausalConv3d, UNetMidBlockCausal3D, get_down_block3d, get_up_block3d
14
+
15
+ import logging
16
+
17
+ logger = logging.getLogger(__name__)
18
+ logging.basicConfig(level=logging.INFO)
19
+
20
+
21
# Latent scaling factor of Hunyuan's 884-16c VAE (matches "scaling_factor" in the
# fixed config JSON embedded in load_vae below).
SCALING_FACTOR = 0.476986
VAE_VER = "884-16c-hy"  # We don't support other versions currently
23
+
24
+
25
def load_vae(
    vae_type: str = "884-16c-hy",
    vae_dtype: Optional[Union[str, torch.dtype]] = None,
    sample_size: Optional[tuple] = None,
    vae_path: Optional[str] = None,
    device=None,
):
    """Load Hunyuan's causal 3D VAE with a fixed, embedded config.

    Args:
        vae_type (str): the type of the 3D VAE model. Only "884-16c-hy" is supported.
        vae_dtype (str or torch.dtype, optional): dtype to cast the VAE to.
            Defaults to None (keep the dtype produced by `from_config`).
        sample_size (tuple, optional): the tiling sample size. Defaults to None.
        vae_path (str, optional): path to the VAE weights (.safetensors or a
            torch checkpoint). Must be provided.
        device (optional): device to move the VAE to. Defaults to None (CPU).

    Returns:
        Tuple of (vae, vae_path, spatial_compression_ratio, time_compression_ratio).

    Raises:
        ValueError: if `vae_path` is not given.
    """
    if vae_path is None:
        # The previous code dereferenced VAE_PATH[vae_type] here, but VAE_PATH is
        # never defined or imported in this module, so this path could only raise
        # NameError. Fail with an explicit, actionable error instead.
        raise ValueError("vae_path must be specified; there is no built-in download mapping for VAE weights.")

    logger.info(f"Loading 3D VAE model ({vae_type}) from: {vae_path}")

    # use fixed config for Hunyuan's VAE
    CONFIG_JSON = """{
    "_class_name": "AutoencoderKLCausal3D",
    "_diffusers_version": "0.4.2",
    "act_fn": "silu",
    "block_out_channels": [
        128,
        256,
        512,
        512
    ],
    "down_block_types": [
        "DownEncoderBlockCausal3D",
        "DownEncoderBlockCausal3D",
        "DownEncoderBlockCausal3D",
        "DownEncoderBlockCausal3D"
    ],
    "in_channels": 3,
    "latent_channels": 16,
    "layers_per_block": 2,
    "norm_num_groups": 32,
    "out_channels": 3,
    "sample_size": 256,
    "sample_tsize": 64,
    "up_block_types": [
        "UpDecoderBlockCausal3D",
        "UpDecoderBlockCausal3D",
        "UpDecoderBlockCausal3D",
        "UpDecoderBlockCausal3D"
    ],
    "scaling_factor": 0.476986,
    "time_compression_ratio": 4,
    "mid_block_add_attention": true
}"""

    # config = AutoencoderKLCausal3D.load_config(vae_path)
    config = json.loads(CONFIG_JSON)

    # import here to avoid circular import
    from .autoencoder_kl_causal_3d import AutoencoderKLCausal3D

    # Optionally override the tiling sample size from the caller.
    if sample_size:
        vae = AutoencoderKLCausal3D.from_config(config, sample_size=sample_size)
    else:
        vae = AutoencoderKLCausal3D.from_config(config)

    # Load weights; accept raw safetensors, torch checkpoints, and checkpoints
    # that wrap the weights in a "state_dict" entry or a "vae." key prefix.
    if vae_path.endswith(".safetensors"):
        from safetensors.torch import load_file

        ckpt = load_file(vae_path)
    else:
        ckpt = torch.load(vae_path, map_location=vae.device, weights_only=True)
    if "state_dict" in ckpt:
        ckpt = ckpt["state_dict"]
    if any(k.startswith("vae.") for k in ckpt.keys()):
        ckpt = {k.replace("vae.", ""): v for k, v in ckpt.items() if k.startswith("vae.")}
    vae.load_state_dict(ckpt)

    spatial_compression_ratio = vae.config.spatial_compression_ratio
    time_compression_ratio = vae.config.time_compression_ratio

    if vae_dtype is not None:
        vae = vae.to(vae_dtype)

    # Inference-only: freeze all parameters.
    vae.requires_grad_(False)

    logger.info(f"VAE to dtype: {vae.dtype}")

    if device is not None:
        vae = vae.to(device)

    vae.eval()

    return vae, vae_path, spatial_compression_ratio, time_compression_ratio
123
+
124
+
125
@dataclass
class DecoderOutput(BaseOutput):
    r"""
    Output of decoding method.

    Args:
        sample (`torch.FloatTensor`):
            The decoded output sample from the last layer of the model.
            (Docstring inherited from diffusers' 2D VAE; for this causal 3D VAE
            the tensor presumably carries an extra frame dimension,
            i.e. `(batch, channels, frames, height, width)` -- confirm against
            `DecoderCausal3D.forward`.)
    """

    sample: torch.FloatTensor
136
+
137
+
138
class EncoderCausal3D(nn.Module):
    r"""
    The `EncoderCausal3D` layer of a variational autoencoder that encodes its input into a latent representation.

    Architecture: a stem conv, a chain of down blocks (spatial downsampling in
    the early stages, temporal downsampling near the end), a mid block, and a
    GroupNorm/SiLU/conv head producing `2 * out_channels` channels when
    `double_z` is set (mean + logvar for the latent distribution).
    """

    def __init__(
        self,
        in_channels: int = 3,
        out_channels: int = 3,
        down_block_types: Tuple[str, ...] = ("DownEncoderBlockCausal3D",),
        block_out_channels: Tuple[int, ...] = (64,),
        layers_per_block: int = 2,
        norm_num_groups: int = 32,
        act_fn: str = "silu",
        double_z: bool = True,
        mid_block_add_attention=True,
        time_compression_ratio: int = 4,
        spatial_compression_ratio: int = 8,
    ):
        super().__init__()
        self.layers_per_block = layers_per_block

        self.conv_in = CausalConv3d(in_channels, block_out_channels[0], kernel_size=3, stride=1)
        self.mid_block = None
        self.down_blocks = nn.ModuleList([])

        # down
        output_channel = block_out_channels[0]
        for i, down_block_type in enumerate(down_block_types):
            input_channel = output_channel
            output_channel = block_out_channels[i]
            is_final_block = i == len(block_out_channels) - 1
            # Number of stages that must halve the spatial / temporal resolution
            # to reach the requested overall compression ratios.
            num_spatial_downsample_layers = int(np.log2(spatial_compression_ratio))
            num_time_downsample_layers = int(np.log2(time_compression_ratio))

            if time_compression_ratio == 4:
                # Spatial downsampling happens in the first stages; temporal
                # downsampling in the last stages, except the final block.
                add_spatial_downsample = bool(i < num_spatial_downsample_layers)
                add_time_downsample = bool(i >= (len(block_out_channels) - 1 - num_time_downsample_layers) and not is_final_block)
            else:
                raise ValueError(f"Unsupported time_compression_ratio: {time_compression_ratio}.")

            # Stride is laid out as (T, H, W): 2 along axes this stage downsamples.
            downsample_stride_HW = (2, 2) if add_spatial_downsample else (1, 1)
            downsample_stride_T = (2,) if add_time_downsample else (1,)
            downsample_stride = tuple(downsample_stride_T + downsample_stride_HW)
            down_block = get_down_block3d(
                down_block_type,
                num_layers=self.layers_per_block,
                in_channels=input_channel,
                out_channels=output_channel,
                add_downsample=bool(add_spatial_downsample or add_time_downsample),
                downsample_stride=downsample_stride,
                resnet_eps=1e-6,
                downsample_padding=0,
                resnet_act_fn=act_fn,
                resnet_groups=norm_num_groups,
                attention_head_dim=output_channel,
                temb_channels=None,  # encoder is unconditional: no time embedding
            )
            self.down_blocks.append(down_block)

        # mid
        self.mid_block = UNetMidBlockCausal3D(
            in_channels=block_out_channels[-1],
            resnet_eps=1e-6,
            resnet_act_fn=act_fn,
            output_scale_factor=1,
            resnet_time_scale_shift="default",
            attention_head_dim=block_out_channels[-1],
            resnet_groups=norm_num_groups,
            temb_channels=None,
            add_attention=mid_block_add_attention,
        )

        # out
        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[-1], num_groups=norm_num_groups, eps=1e-6)
        self.conv_act = nn.SiLU()

        # double_z -> emit concatenated (mean, logvar) channels for the posterior.
        conv_out_channels = 2 * out_channels if double_z else out_channels
        self.conv_out = CausalConv3d(block_out_channels[-1], conv_out_channels, kernel_size=3)

    def forward(self, sample: torch.FloatTensor) -> torch.FloatTensor:
        r"""The forward method of the `EncoderCausal3D` class.

        Expects a 5D tensor (batch, channels, frames, height, width) and returns
        the encoded feature map from `conv_out`.
        """
        assert len(sample.shape) == 5, "The input tensor should have 5 dimensions"

        sample = self.conv_in(sample)

        # down
        for down_block in self.down_blocks:
            sample = down_block(sample)

        # middle
        sample = self.mid_block(sample)

        # post-process
        sample = self.conv_norm_out(sample)
        sample = self.conv_act(sample)
        sample = self.conv_out(sample)

        return sample
237
+
238
+
239
class DecoderCausal3D(nn.Module):
    r"""
    The `DecoderCausal3D` layer of a variational autoencoder that decodes its latent representation into an output sample.

    Mirror image of `EncoderCausal3D`: stem conv, mid block, a chain of up
    blocks (spatial upsampling first, temporal upsampling near the end), then a
    norm/SiLU/conv head. Supports gradient checkpointing via the
    `gradient_checkpointing` flag.
    """

    def __init__(
        self,
        in_channels: int = 3,
        out_channels: int = 3,
        up_block_types: Tuple[str, ...] = ("UpDecoderBlockCausal3D",),
        block_out_channels: Tuple[int, ...] = (64,),
        layers_per_block: int = 2,
        norm_num_groups: int = 32,
        act_fn: str = "silu",
        norm_type: str = "group",  # group, spatial
        mid_block_add_attention=True,
        time_compression_ratio: int = 4,
        spatial_compression_ratio: int = 8,
    ):
        super().__init__()
        self.layers_per_block = layers_per_block

        self.conv_in = CausalConv3d(in_channels, block_out_channels[-1], kernel_size=3, stride=1)
        self.mid_block = None
        self.up_blocks = nn.ModuleList([])

        # "spatial" norm conditions on the latent embedding; "group" does not.
        temb_channels = in_channels if norm_type == "spatial" else None

        # mid
        self.mid_block = UNetMidBlockCausal3D(
            in_channels=block_out_channels[-1],
            resnet_eps=1e-6,
            resnet_act_fn=act_fn,
            output_scale_factor=1,
            resnet_time_scale_shift="default" if norm_type == "group" else norm_type,
            attention_head_dim=block_out_channels[-1],
            resnet_groups=norm_num_groups,
            temb_channels=temb_channels,
            add_attention=mid_block_add_attention,
        )

        # up
        reversed_block_out_channels = list(reversed(block_out_channels))
        output_channel = reversed_block_out_channels[0]
        for i, up_block_type in enumerate(up_block_types):
            prev_output_channel = output_channel
            output_channel = reversed_block_out_channels[i]
            is_final_block = i == len(block_out_channels) - 1
            # Number of stages that must double the spatial / temporal resolution.
            num_spatial_upsample_layers = int(np.log2(spatial_compression_ratio))
            num_time_upsample_layers = int(np.log2(time_compression_ratio))

            if time_compression_ratio == 4:
                # Same staging as the encoder, mirrored: spatial upsampling in the
                # early (deepest) stages, temporal upsampling later, none on the
                # final block.
                add_spatial_upsample = bool(i < num_spatial_upsample_layers)
                add_time_upsample = bool(i >= len(block_out_channels) - 1 - num_time_upsample_layers and not is_final_block)
            else:
                raise ValueError(f"Unsupported time_compression_ratio: {time_compression_ratio}.")

            # Scale factor laid out as (T, H, W).
            upsample_scale_factor_HW = (2, 2) if add_spatial_upsample else (1, 1)
            upsample_scale_factor_T = (2,) if add_time_upsample else (1,)
            upsample_scale_factor = tuple(upsample_scale_factor_T + upsample_scale_factor_HW)
            up_block = get_up_block3d(
                up_block_type,
                num_layers=self.layers_per_block + 1,
                in_channels=prev_output_channel,
                out_channels=output_channel,
                prev_output_channel=None,
                add_upsample=bool(add_spatial_upsample or add_time_upsample),
                upsample_scale_factor=upsample_scale_factor,
                resnet_eps=1e-6,
                resnet_act_fn=act_fn,
                resnet_groups=norm_num_groups,
                attention_head_dim=output_channel,
                temb_channels=temb_channels,
                resnet_time_scale_shift=norm_type,
            )
            self.up_blocks.append(up_block)
            prev_output_channel = output_channel  # NOTE: redundant; reassigned at loop top

        # out
        if norm_type == "spatial":
            self.conv_norm_out = SpatialNorm(block_out_channels[0], temb_channels)
        else:
            self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6)
        self.conv_act = nn.SiLU()
        self.conv_out = CausalConv3d(block_out_channels[0], out_channels, kernel_size=3)

        self.gradient_checkpointing = False

    def forward(
        self,
        sample: torch.FloatTensor,
        latent_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.FloatTensor:
        r"""The forward method of the `DecoderCausal3D` class.

        Args:
            sample: 5D latent tensor (batch, channels, frames, height, width).
            latent_embeds: optional conditioning passed to mid/up blocks and, when
                spatial norm is used, to the output norm.
        """
        assert len(sample.shape) == 5, "The input tensor should have 5 dimensions."

        sample = self.conv_in(sample)

        # Cast mid-block output to the up-blocks' parameter dtype before upsampling.
        upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
        if self.training and self.gradient_checkpointing:

            def create_custom_forward(module):
                def custom_forward(*inputs):
                    return module(*inputs)

                return custom_forward

            # use_reentrant=False is only available on torch >= 1.11
            if is_torch_version(">=", "1.11.0"):
                # middle
                sample = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(self.mid_block),
                    sample,
                    latent_embeds,
                    use_reentrant=False,
                )
                sample = sample.to(upscale_dtype)

                # up
                for up_block in self.up_blocks:
                    sample = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(up_block),
                        sample,
                        latent_embeds,
                        use_reentrant=False,
                    )
            else:
                # middle
                sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample, latent_embeds)
                sample = sample.to(upscale_dtype)

                # up
                for up_block in self.up_blocks:
                    sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, latent_embeds)
        else:
            # middle
            sample = self.mid_block(sample, latent_embeds)
            sample = sample.to(upscale_dtype)

            # up
            for up_block in self.up_blocks:
                sample = up_block(sample, latent_embeds)

        # post-process; spatial norm additionally consumes the latent embedding
        if latent_embeds is None:
            sample = self.conv_norm_out(sample)
        else:
            sample = self.conv_norm_out(sample, latent_embeds)
        sample = self.conv_act(sample)
        sample = self.conv_out(sample)

        return sample
390
+
391
+
392
class DiagonalGaussianDistribution(object):
    """Factorized Gaussian over latents, parameterized by concatenated mean/logvar.

    `parameters` carries mean and log-variance stacked along the channel axis:
    dim 2 for (B, L, C) inputs, dim 1 for (B, C, H, W) / (B, C, T, H, W).
    """

    def __init__(self, parameters: torch.Tensor, deterministic: bool = False):
        # Pick the axis along which mean/logvar were concatenated.
        if parameters.ndim == 3:
            split_dim = 2  # (B, L, C)
        elif parameters.ndim in (4, 5):
            split_dim = 1  # (B, C, H, W) / (B, C, T, H, W)
        else:
            raise NotImplementedError
        self.parameters = parameters
        mean, logvar = torch.chunk(parameters, 2, dim=split_dim)
        self.mean = mean
        # Clamp logvar for numerical stability before exponentiating.
        self.logvar = torch.clamp(logvar, -30.0, 20.0)
        self.deterministic = deterministic
        self.std = torch.exp(0.5 * self.logvar)
        self.var = torch.exp(self.logvar)
        if self.deterministic:
            # Degenerate distribution: zero variance, sample() returns the mean.
            zeros = torch.zeros_like(self.mean, device=self.parameters.device, dtype=self.parameters.dtype)
            self.var = zeros
            self.std = zeros

    def sample(self, generator: Optional[torch.Generator] = None) -> torch.FloatTensor:
        """Reparameterized draw: mean + std * eps, on the parameters' device/dtype."""
        noise = randn_tensor(
            self.mean.shape,
            generator=generator,
            device=self.parameters.device,
            dtype=self.parameters.dtype,
        )
        return self.mean + self.std * noise

    def kl(self, other: "DiagonalGaussianDistribution" = None) -> torch.Tensor:
        """KL divergence to `other` (or to the standard normal when omitted), summed over non-batch dims."""
        if self.deterministic:
            return torch.Tensor([0.0])
        reduce_dim = list(range(1, self.mean.ndim))
        if other is None:
            return 0.5 * torch.sum(
                self.mean.pow(2) + self.var - 1.0 - self.logvar,
                dim=reduce_dim,
            )
        return 0.5 * torch.sum(
            (self.mean - other.mean).pow(2) / other.var + self.var / other.var - 1.0 - self.logvar + other.logvar,
            dim=reduce_dim,
        )

    def nll(self, sample: torch.Tensor, dims: Tuple[int, ...] = [1, 2, 3]) -> torch.Tensor:
        """Negative log-likelihood of `sample` under this Gaussian, summed over `dims`."""
        if self.deterministic:
            return torch.Tensor([0.0])
        log_two_pi = np.log(2.0 * np.pi)
        per_element = log_two_pi + self.logvar + (sample - self.mean).pow(2) / self.var
        return 0.5 * torch.sum(per_element, dim=dims)

    def mode(self) -> torch.Tensor:
        """Distribution mode (equals the mean for a Gaussian)."""
        return self.mean
hv_generate_video.py ADDED
@@ -0,0 +1,936 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from datetime import datetime
3
+ from pathlib import Path
4
+ import random
5
+ import sys
6
+ import os
7
+ import time
8
+ from typing import Optional, Union
9
+
10
+ import numpy as np
11
+ import torch
12
+ import torchvision
13
+ import accelerate
14
+ from diffusers.utils.torch_utils import randn_tensor
15
+ from transformers.models.llama import LlamaModel
16
+ from tqdm import tqdm
17
+ import av
18
+ from einops import rearrange
19
+ from safetensors.torch import load_file, save_file
20
+ from safetensors import safe_open
21
+ from PIL import Image
22
+
23
+ from hunyuan_model import vae
24
+ from hunyuan_model.text_encoder import TextEncoder
25
+ from hunyuan_model.text_encoder import PROMPT_TEMPLATE
26
+ from hunyuan_model.vae import load_vae
27
+ from hunyuan_model.models import load_transformer, get_rotary_pos_embed
28
+ from hunyuan_model.fp8_optimization import convert_fp8_linear
29
+ from modules.scheduling_flow_match_discrete import FlowMatchDiscreteScheduler
30
+ from networks import lora
31
+
32
+ try:
33
+ from lycoris.kohya import create_network_from_weights
34
+ except:
35
+ pass
36
+
37
+ from utils.model_utils import str_to_dtype
38
+ from utils.safetensors_utils import mem_eff_save_file
39
+ from dataset.image_video_dataset import load_video, glob_images, resize_image_to_bucket
40
+
41
+ import logging
42
+
43
+ logger = logging.getLogger(__name__)
44
+ logging.basicConfig(level=logging.INFO)
45
+
46
+
47
def clean_memory_on_device(device):
    """Release cached allocator memory on the given device; CPU needs nothing."""
    dev_type = device.type
    if dev_type == "cuda":
        torch.cuda.empty_cache()
    elif dev_type == "mps":  # not tested
        torch.mps.empty_cache()
    # "cpu" (and any other backend) requires no explicit cache flush
54
+
55
+
56
def synchronize_device(device: torch.device):
    """Block until all queued work on the device finishes (cuda/xpu/mps); CPU is a no-op."""
    dev_type = device.type
    if dev_type == "cuda":
        torch.cuda.synchronize()
    elif dev_type == "xpu":
        torch.xpu.synchronize()
    elif dev_type == "mps":
        torch.mps.synchronize()
63
+
64
+
65
def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=1, fps=24):
    """save videos by video tensor
    copy from https://github.com/guoyww/AnimateDiff/blob/e92bd5671ba62c0d774a32951453e328018b7c5b/animatediff/utils/util.py#L61

    Args:
        videos (torch.Tensor): video tensor predicted by the model, laid out as (b, c, t, h, w)
        path (str): path to save video
        rescale (bool, optional): rescale the video tensor from [-1, 1] to [0, 1]. Defaults to False.
        n_rows (int, optional): number of images per row in each frame's grid. Defaults to 1.
        fps (int, optional): video save fps. Defaults to 24.
    """
    # one grid image per timestep: (b, c, t, h, w) -> t x (b, c, h, w)
    videos = rearrange(videos, "b c t h w -> t b c h w")
    outputs = []
    for x in videos:
        x = torchvision.utils.make_grid(x, nrow=n_rows)
        # (c, h, w) -> (h, w, c); squeeze drops a trailing singleton channel, if any
        x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
        if rescale:
            x = (x + 1.0) / 2.0  # -1,1 -> 0,1
        x = torch.clamp(x, 0, 1)
        x = (x * 255).numpy().astype(np.uint8)
        outputs.append(x)

    # NOTE(review): os.path.dirname(path) is "" for a bare filename, which makes
    # makedirs raise -- callers presumably always pass a path with a directory.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    # # save video with av
    # container = av.open(path, "w")
    # stream = container.add_stream("libx264", rate=fps)
    # for x in outputs:
    #     frame = av.VideoFrame.from_ndarray(x, format="rgb24")
    #     packet = stream.encode(frame)
    #     container.mux(packet)
    # packet = stream.encode(None)
    # container.mux(packet)
    # container.close()

    height, width, _ = outputs[0].shape

    # create output container
    container = av.open(path, mode="w")

    # create video stream
    codec = "libx264"
    pixel_format = "yuv420p"
    stream = container.add_stream(codec, rate=fps)
    stream.width = width
    stream.height = height
    stream.pix_fmt = pixel_format
    stream.bit_rate = 4000000  # 4Mbit/s

    # encode every frame; each encode() call may emit zero or more packets
    for frame_array in outputs:
        frame = av.VideoFrame.from_ndarray(frame_array, format="rgb24")
        packets = stream.encode(frame)
        for packet in packets:
            container.mux(packet)

    # flush the encoder's buffered frames before closing
    for packet in stream.encode():
        container.mux(packet)

    container.close()
124
+
125
+
126
def save_images_grid(
    videos: torch.Tensor, parent_dir: str, image_name: str, rescale: bool = False, n_rows: int = 1, create_subdir=True
):
    """Save every frame of a (b, c, t, h, w) video batch as a PNG grid image.

    Files are written as `{image_name}_{index:03d}.png`, either directly in
    `parent_dir` or in a `parent_dir/image_name` subdirectory.
    """
    # one grid per timestep: (b, c, t, h, w) -> t x (b, c, h, w)
    frames = rearrange(videos, "b c t h w -> t b c h w")
    grids = []
    for frame in frames:
        grid = torchvision.utils.make_grid(frame, nrow=n_rows)
        # (c, h, w) -> (h, w, c); drop a trailing singleton channel, if any
        grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1)
        if rescale:
            grid = (grid + 1.0) / 2.0  # [-1, 1] -> [0, 1]
        grid = torch.clamp(grid, 0, 1)
        grids.append((grid * 255).numpy().astype(np.uint8))

    output_dir = os.path.join(parent_dir, image_name) if create_subdir else parent_dir
    os.makedirs(output_dir, exist_ok=True)

    for index, pixels in enumerate(grids):
        image_path = os.path.join(output_dir, f"{image_name}_{index:03d}.png")
        Image.fromarray(pixels).save(image_path)
150
+
151
+
152
+ # region Encoding prompt
153
+
154
+
155
def encode_prompt(prompt: Union[str, list[str]], device: torch.device, num_videos_per_prompt: int, text_encoder: TextEncoder):
    r"""
    Encodes the prompt into text encoder hidden states.

    Args:
        prompt (`str` or `List[str]`):
            prompt to be encoded
        device: (`torch.device`):
            torch device
        num_videos_per_prompt (`int`):
            number of videos that should be generated per prompt
        text_encoder (TextEncoder):
            text encoder to be used for encoding the prompt

    Returns:
        Tuple of (prompt_embeds, attention_mask); the mask may be None when the
        encoder does not produce one. Both are tiled `num_videos_per_prompt` times.
    """
    # LoRA / Textual Inversion, negative prompts, and clip_skip are not supported here.
    data_type = "video"  # video only, image is not supported

    tokens = text_encoder.text2tokens(prompt, data_type=data_type)

    with torch.no_grad():
        encoder_outputs = text_encoder.encode(tokens, data_type=data_type, device=device)
    prompt_embeds = encoder_outputs.hidden_state

    attention_mask = encoder_outputs.attention_mask
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)
        batch, seq_len = attention_mask.shape
        # tile the mask once per generated video, then flatten batch x repeats
        attention_mask = attention_mask.repeat(1, num_videos_per_prompt)
        attention_mask = attention_mask.view(batch * num_videos_per_prompt, seq_len)

    prompt_embeds = prompt_embeds.to(dtype=text_encoder.dtype, device=device)

    # duplicate text embeddings for each generation per prompt (mps-friendly repeat+view)
    if prompt_embeds.ndim == 2:
        batch, _ = prompt_embeds.shape
        prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt)
        prompt_embeds = prompt_embeds.view(batch * num_videos_per_prompt, -1)
    else:
        batch, seq_len, _ = prompt_embeds.shape
        prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(batch * num_videos_per_prompt, seq_len, -1)

    return prompt_embeds, attention_mask
202
+
203
+
204
def encode_input_prompt(prompt: Union[str, list[str]], args, device, fp8_llm=False, accelerator=None):
    """Encode a prompt with both HunyuanVideo text encoders and return CPU tensors.

    Loads text encoder 1 (LLM) and text encoder 2 (CLIP-L) from the paths in
    `args`, encodes `prompt` with each, and frees the encoders afterwards.

    Args:
        prompt: prompt (or list of prompts) to encode.
        args: parsed CLI args; reads `args.text_encoder1` and `args.text_encoder2`.
        device: device to run the encoders on.
        fp8_llm: if True, cast the LLM encoder to float8_e4m3fn and patch its
            Embedding / RMSNorm layers to keep them numerically usable.
        accelerator: required when `fp8_llm` is set; its autocast wraps encoding.

    Returns:
        Tuple (prompt_embeds, prompt_mask, prompt_embeds_2, prompt_mask_2), all on CPU.
    """
    # constants
    prompt_template_video = "dit-llm-encode-video"
    prompt_template = "dit-llm-encode"
    text_encoder_dtype = torch.float16
    text_encoder_type = "llm"
    text_len = 256
    hidden_state_skip_layer = 2
    apply_final_norm = False
    reproduce = False

    text_encoder_2_type = "clipL"
    text_len_2 = 77

    num_videos = 1

    # if args.prompt_template_video is not None:
    #     crop_start = PROMPT_TEMPLATE[args.prompt_template_video].get("crop_start", 0)
    # elif args.prompt_template is not None:
    #     crop_start = PROMPT_TEMPLATE[args.prompt_template].get("crop_start", 0)
    # else:
    #     crop_start = 0
    # max_length includes the template prefix that is cropped from the output.
    crop_start = PROMPT_TEMPLATE[prompt_template_video].get("crop_start", 0)
    max_length = text_len + crop_start

    # prompt_template
    prompt_template = PROMPT_TEMPLATE[prompt_template]

    # prompt_template_video
    prompt_template_video = PROMPT_TEMPLATE[prompt_template_video]  # if args.prompt_template_video is not None else None

    # load text encoders
    logger.info(f"loading text encoder: {args.text_encoder1}")
    text_encoder = TextEncoder(
        text_encoder_type=text_encoder_type,
        max_length=max_length,
        text_encoder_dtype=text_encoder_dtype,
        text_encoder_path=args.text_encoder1,
        tokenizer_type=text_encoder_type,
        prompt_template=prompt_template,
        prompt_template_video=prompt_template_video,
        hidden_state_skip_layer=hidden_state_skip_layer,
        apply_final_norm=apply_final_norm,
        reproduce=reproduce,
    )
    text_encoder.eval()
    if fp8_llm:
        org_dtype = text_encoder.dtype
        logger.info(f"Moving and casting text encoder to {device} and torch.float8_e4m3fn")
        text_encoder.to(device=device, dtype=torch.float8_e4m3fn)

        # prepare LLM for fp8
        def prepare_fp8(llama_model: LlamaModel, target_dtype):
            # Replaces LlamaRMSNorm.forward with an fp32 reimplementation so the
            # normalization stays accurate despite fp8 weights elsewhere.
            def forward_hook(module):
                def forward(hidden_states):
                    input_dtype = hidden_states.dtype
                    hidden_states = hidden_states.to(torch.float32)
                    variance = hidden_states.pow(2).mean(-1, keepdim=True)
                    hidden_states = hidden_states * torch.rsqrt(variance + module.variance_epsilon)
                    return module.weight.to(input_dtype) * hidden_states.to(input_dtype)

                return forward

            for module in llama_model.modules():
                if module.__class__.__name__ in ["Embedding"]:
                    # keep embeddings in the original (higher-precision) dtype
                    # print("set", module.__class__.__name__, "to", target_dtype)
                    module.to(target_dtype)
                if module.__class__.__name__ in ["LlamaRMSNorm"]:
                    # print("set", module.__class__.__name__, "hooks")
                    module.forward = forward_hook(module)

        prepare_fp8(text_encoder.model, org_dtype)

    logger.info(f"loading text encoder 2: {args.text_encoder2}")
    text_encoder_2 = TextEncoder(
        text_encoder_type=text_encoder_2_type,
        max_length=text_len_2,
        text_encoder_dtype=text_encoder_dtype,
        text_encoder_path=args.text_encoder2,
        tokenizer_type=text_encoder_2_type,
        reproduce=reproduce,
    )
    text_encoder_2.eval()

    # encode prompt
    logger.info(f"Encoding prompt with text encoder 1")
    text_encoder.to(device=device)
    if fp8_llm:
        with accelerator.autocast():
            prompt_embeds, prompt_mask = encode_prompt(prompt, device, num_videos, text_encoder)
    else:
        prompt_embeds, prompt_mask = encode_prompt(prompt, device, num_videos, text_encoder)
    # drop encoder 1 before loading work onto the device for encoder 2
    text_encoder = None
    clean_memory_on_device(device)

    logger.info(f"Encoding prompt with text encoder 2")
    text_encoder_2.to(device=device)
    prompt_embeds_2, prompt_mask_2 = encode_prompt(prompt, device, num_videos, text_encoder_2)

    # move results to CPU so the device is fully freed for the DiT
    prompt_embeds = prompt_embeds.to("cpu")
    prompt_mask = prompt_mask.to("cpu")
    prompt_embeds_2 = prompt_embeds_2.to("cpu")
    prompt_mask_2 = prompt_mask_2.to("cpu")

    text_encoder_2 = None
    clean_memory_on_device(device)

    return prompt_embeds, prompt_mask, prompt_embeds_2, prompt_mask_2
312
+
313
+
314
+ # endregion
315
+
316
+
317
def prepare_vae(args, device):
    """Load the VAE in eval mode and configure conv chunking / spatial tiling from args.

    Returns:
        Tuple (vae, vae_dtype).
    """
    vae_dtype = str_to_dtype(args.vae_dtype) if args.vae_dtype is not None else torch.float16
    vae, _, s_ratio, t_ratio = load_vae(vae_dtype=vae_dtype, device=device, vae_path=args.vae)
    vae.eval()
    # vae_kwargs = {"s_ratio": s_ratio, "t_ratio": t_ratio}

    # propagate chunk_size to every CausalConv3d inside the model
    chunk_size = args.vae_chunk_size
    if chunk_size is not None:
        vae.set_chunk_size_for_causal_conv_3d(chunk_size)
        logger.info(f"Set chunk_size to {chunk_size} for CausalConv3d")

    if args.vae_spatial_tile_sample_min_size is not None:
        vae.enable_spatial_tiling(True)
        vae.tile_sample_min_size = args.vae_spatial_tile_sample_min_size
        # latent tiles are 8x smaller spatially than sample tiles
        vae.tile_latent_min_size = args.vae_spatial_tile_sample_min_size // 8
    else:
        # tiling is always enabled to bound peak memory use
        vae.enable_spatial_tiling(True)

    return vae, vae_dtype
338
+
339
+
340
def encode_to_latents(args, video, device):
    """Encode a pixel video in [0, 1] into scaled VAE latents."""
    vae, vae_dtype = prepare_vae(args, device)

    video = video.to(device=device, dtype=vae_dtype)
    video = video * 2 - 1  # map [0, 1] -> [-1, 1] as the VAE expects
    with torch.no_grad():
        latents = vae.encode(video).latent_dist.sample()

    # apply the config's shift/scale normalization to the sampled latents
    shift = getattr(vae.config, "shift_factor", None)
    if shift:
        latents = (latents - shift) * vae.config.scaling_factor
    else:
        latents = latents * vae.config.scaling_factor

    return latents
354
+
355
+
356
def decode_latents(args, latents, device):
    """Decode VAE latents into pixel frames in [0, 1], returned as float32 on CPU."""
    vae, vae_dtype = prepare_vae(args, device)

    # accept 4D (image) latents by adding a singleton temporal axis
    squeeze_frames = False
    if len(latents.shape) == 4:
        latents = latents.unsqueeze(2)
        squeeze_frames = True
    elif len(latents.shape) != 5:
        raise ValueError(f"Only support latents with shape (b, c, h, w) or (b, c, f, h, w), but got {latents.shape}.")

    # undo the encode-side shift/scale normalization
    shift = getattr(vae.config, "shift_factor", None)
    if shift:
        latents = latents / vae.config.scaling_factor + shift
    else:
        latents = latents / vae.config.scaling_factor

    latents = latents.to(device=device, dtype=vae_dtype)
    with torch.no_grad():
        image = vae.decode(latents, return_dict=False)[0]

    if squeeze_frames:
        image = image.squeeze(2)

    image = (image / 2 + 0.5).clamp(0, 1)
    # cast to float32: negligible overhead and compatible with bfloat16 outputs
    image = image.cpu().float()

    return image
385
+
386
+
387
def parse_args():
    """Build and validate CLI arguments for HunyuanVideo inference.

    Returns:
        argparse.Namespace with all generation, model, LoRA, and performance
        options.

    Raises:
        ValueError: if --fp8_fast is requested without --fp8.
        AssertionError: if --latent_path is combined with a latent output type.
    """
    parser = argparse.ArgumentParser(description="HunyuanVideo inference script")

    parser.add_argument("--dit", type=str, required=True, help="DiT checkpoint path or directory")
    parser.add_argument(
        "--dit_in_channels",
        type=int,
        default=None,
        help="input channels for DiT, default is None (automatically detect). 32 for SkyReels-I2V, 16 for others",
    )
    parser.add_argument("--vae", type=str, required=True, help="VAE checkpoint path or directory")
    parser.add_argument("--vae_dtype", type=str, default=None, help="data type for VAE, default is float16")
    parser.add_argument("--text_encoder1", type=str, required=True, help="Text Encoder 1 directory")
    parser.add_argument("--text_encoder2", type=str, required=True, help="Text Encoder 2 directory")

    # LoRA
    parser.add_argument("--lora_weight", type=str, nargs="*", required=False, default=None, help="LoRA weight path")
    # BUGFIX: default must be a list because nargs="*" produces a list and the
    # caller does len(args.lora_multiplier) / indexes it; a bare float default
    # raised TypeError when --lora_weight was given without --lora_multiplier.
    # This also matches merge_lora.py, which already uses default=[1.0].
    parser.add_argument("--lora_multiplier", type=float, nargs="*", default=[1.0], help="LoRA multiplier")
    parser.add_argument(
        "--save_merged_model",
        type=str,
        default=None,
        help="Save merged model to path. If specified, no inference will be performed.",
    )
    parser.add_argument("--exclude_single_blocks", action="store_true", help="Exclude single blocks when loading LoRA weights")

    # inference
    parser.add_argument("--prompt", type=str, required=True, help="prompt for generation")
    parser.add_argument("--negative_prompt", type=str, default=None, help="negative prompt for generation")
    parser.add_argument("--video_size", type=int, nargs=2, default=[256, 256], help="video size")
    parser.add_argument("--video_length", type=int, default=129, help="video length")
    parser.add_argument("--fps", type=int, default=24, help="video fps")
    parser.add_argument("--infer_steps", type=int, default=50, help="number of inference steps")
    parser.add_argument("--save_path", type=str, required=True, help="path to save generated video")
    parser.add_argument("--seed", type=int, default=None, help="Seed for evaluation.")
    parser.add_argument(
        "--guidance_scale",
        type=float,
        default=1.0,
        help="Guidance scale for classifier free guidance. Default is 1.0 (means no guidance)",
    )
    parser.add_argument("--embedded_cfg_scale", type=float, default=6.0, help="Embeded classifier free guidance scale.")
    parser.add_argument("--video_path", type=str, default=None, help="path to video for video2video inference")
    parser.add_argument(
        "--image_path", type=str, default=None, help="path to image for image2video inference, only works for SkyReels-I2V model"
    )
    parser.add_argument(
        "--split_uncond",
        action="store_true",
        help="split unconditional call for classifier free guidance, slower but less memory usage",
    )
    parser.add_argument("--strength", type=float, default=0.8, help="strength for video2video inference")

    # Flow Matching
    parser.add_argument("--flow_shift", type=float, default=7.0, help="Shift factor for flow matching schedulers.")

    parser.add_argument("--fp8", action="store_true", help="use fp8 for DiT model")
    parser.add_argument("--fp8_llm", action="store_true", help="use fp8 for Text Encoder 1 (LLM)")
    parser.add_argument(
        "--device", type=str, default=None, help="device to use for inference. If None, use CUDA if available, otherwise use CPU"
    )
    parser.add_argument(
        "--attn_mode", type=str, default="torch", choices=["flash", "torch", "sageattn", "xformers", "sdpa"], help="attention mode"
    )
    parser.add_argument(
        "--split_attn", action="store_true", help="use split attention, default is False. if True, --split_uncond becomes True"
    )
    parser.add_argument("--vae_chunk_size", type=int, default=None, help="chunk size for CausalConv3d in VAE")
    parser.add_argument(
        "--vae_spatial_tile_sample_min_size", type=int, default=None, help="spatial tile sample min size for VAE, default 256"
    )
    parser.add_argument("--blocks_to_swap", type=int, default=None, help="number of blocks to swap in the model")
    parser.add_argument("--img_in_txt_in_offloading", action="store_true", help="offload img_in and txt_in to cpu")
    parser.add_argument(
        "--output_type", type=str, default="video", choices=["video", "images", "latent", "both"], help="output type"
    )
    parser.add_argument("--no_metadata", action="store_true", help="do not save metadata")
    parser.add_argument("--latent_path", type=str, nargs="*", default=None, help="path to latent for decode. no inference")
    parser.add_argument("--lycoris", action="store_true", help="use lycoris for inference")
    parser.add_argument("--fp8_fast", action="store_true", help="Enable fast FP8 arthimetic(RTX 4XXX+)")
    parser.add_argument("--compile", action="store_true", help="Enable torch.compile")
    parser.add_argument(
        "--compile_args",
        nargs=4,
        metavar=("BACKEND", "MODE", "DYNAMIC", "FULLGRAPH"),
        default=["inductor", "max-autotune-no-cudagraphs", "False", "False"],
        help="Torch.compile settings",
    )

    args = parser.parse_args()

    # --latent_path bypasses inference entirely, so only pixel outputs make sense.
    assert (args.latent_path is None or len(args.latent_path) == 0) or (
        args.output_type == "images" or args.output_type == "video"
    ), "latent_path is only supported for images or video output"

    if args.fp8_fast and not args.fp8:
        raise ValueError("--fp8_fast requires --fp8")

    return args
488
+
489
+
490
def check_inputs(args):
    """Validate the requested generation geometry.

    Returns:
        (height, width, video_length) taken from args.

    Raises:
        ValueError: if height or width is not a multiple of 8.
    """
    height, width = args.video_size
    video_length = args.video_length

    if height % 8 != 0 or width % 8 != 0:
        raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
    return height, width, video_length
498
+
499
+
500
def main():
    """CLI entry point for HunyuanVideo inference.

    Two modes:
      * ``--latent_path`` given: skip inference and only decode/save the
        provided latents (seeds recovered from safetensors metadata).
      * otherwise: encode the prompt (and optionally a reference video for
        v2v or an image for i2v), denoise with the DiT, then save latents,
        video, or images according to ``--output_type``.
    """
    args = parse_args()

    # Device / dtype setup: compute in bf16; weights optionally stored as fp8.
    device = args.device if args.device is not None else "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)
    dit_dtype = torch.bfloat16
    dit_weight_dtype = torch.float8_e4m3fn if args.fp8 else dit_dtype
    logger.info(f"Using device: {device}, DiT precision: {dit_dtype}, weight precision: {dit_weight_dtype}")

    original_base_names = None
    if args.latent_path is not None and len(args.latent_path) > 0:
        # Decode-only mode: load pre-computed latents from disk.
        original_base_names = []
        latents_list = []
        seeds = []
        for latent_path in args.latent_path:
            original_base_names.append(os.path.splitext(os.path.basename(latent_path))[0])
            seed = 0  # fallback when the file carries no seed metadata

            if os.path.splitext(latent_path)[1] != ".safetensors":
                latents = torch.load(latent_path, map_location="cpu")
            else:
                latents = load_file(latent_path)["latent"]
                with safe_open(latent_path, framework="pt") as f:
                    metadata = f.metadata()
                if metadata is None:
                    metadata = {}
                logger.info(f"Loaded metadata: {metadata}")

                if "seeds" in metadata:
                    seed = int(metadata["seeds"])

            seeds.append(seed)
            latents_list.append(latents)

            logger.info(f"Loaded latent from {latent_path}. Shape: {latents.shape}")
        latents = torch.stack(latents_list, dim=0)
    else:
        # prepare accelerator (provides autocast for the denoising loop)
        mixed_precision = "bf16" if dit_dtype == torch.bfloat16 else "fp16"
        accelerator = accelerate.Accelerator(mixed_precision=mixed_precision)

        # load prompt
        prompt = args.prompt  # TODO load prompts from file
        assert prompt is not None, "prompt is required"

        # check inputs: may be height, width, video_length etc will be changed for each generation in future
        height, width, video_length = check_inputs(args)

        # encode prompt with LLM and Text Encoder
        logger.info(f"Encoding prompt: {prompt}")

        do_classifier_free_guidance = args.guidance_scale != 1.0
        if do_classifier_free_guidance:
            # CFG batches [negative, positive] prompts together.
            negative_prompt = args.negative_prompt
            if negative_prompt is None:
                logger.info("Negative prompt is not provided, using empty prompt")
                negative_prompt = ""
            logger.info(f"Encoding negative prompt: {negative_prompt}")
            prompt = [negative_prompt, prompt]
        else:
            if args.negative_prompt is not None:
                logger.warning("Negative prompt is provided but guidance_scale is 1.0, negative prompt will be ignored.")

        prompt_embeds, prompt_mask, prompt_embeds_2, prompt_mask_2 = encode_input_prompt(
            prompt, args, device, args.fp8_llm, accelerator
        )

        # encode latents for video2video inference
        video_latents = None
        if args.video_path is not None:
            # v2v inference
            logger.info(f"Video2Video inference: {args.video_path}")
            video = load_video(args.video_path, 0, video_length, bucket_reso=(width, height))  # list of frames
            if len(video) < video_length:
                raise ValueError(f"Video length is less than {video_length}")
            video = np.stack(video, axis=0)  # F, H, W, C
            video = torch.from_numpy(video).permute(3, 0, 1, 2).unsqueeze(0).float()  # 1, C, F, H, W
            video = video / 255.0  # uint8 range -> [0, 1]

            logger.info(f"Encoding video to latents")
            video_latents = encode_to_latents(args, video, device)
            video_latents = video_latents.to(device=device, dtype=dit_dtype)

            clean_memory_on_device(device)

        # encode latents for image2video inference
        image_latents = None
        if args.image_path is not None:
            # i2v inference
            logger.info(f"Image2Video inference: {args.image_path}")

            image = Image.open(args.image_path)
            image = resize_image_to_bucket(image, (width, height))  # returns a numpy array
            image = torch.from_numpy(image).permute(2, 0, 1).unsqueeze(0).unsqueeze(2).float()  # 1, C, 1, H, W
            image = image / 255.0

            logger.info(f"Encoding image to latents")
            image_latents = encode_to_latents(args, image, device)  # 1, C, 1, H, W
            image_latents = image_latents.to(device=device, dtype=dit_dtype)

            clean_memory_on_device(device)

        # load DiT model
        blocks_to_swap = args.blocks_to_swap if args.blocks_to_swap else 0
        loading_device = "cpu"  # if blocks_to_swap > 0 else device

        logger.info(f"Loading DiT model from {args.dit}")
        if args.attn_mode == "sdpa":
            args.attn_mode = "torch"  # "sdpa" is an alias for the torch backend

        # if image_latents is given, the model should be I2V model, so the in_channels should be 32
        dit_in_channels = args.dit_in_channels if args.dit_in_channels is not None else (32 if image_latents is not None else 16)

        # if we use LoRA, weights should be bf16 instead of fp8, because merging should be done in bf16
        # the model is too large, so we load the model to cpu. in addition, the .pt file is loaded to cpu anyway
        # on the fly merging will be a solution for this issue for .safetensors files (not implemented yet)
        transformer = load_transformer(
            args.dit, args.attn_mode, args.split_attn, loading_device, dit_dtype, in_channels=dit_in_channels
        )
        transformer.eval()

        # load LoRA weights and merge them into the DiT before any dtype cast
        if args.lora_weight is not None and len(args.lora_weight) > 0:
            for i, lora_weight in enumerate(args.lora_weight):
                # pair each weight file with its multiplier; default 1.0
                if args.lora_multiplier is not None and len(args.lora_multiplier) > i:
                    lora_multiplier = args.lora_multiplier[i]
                else:
                    lora_multiplier = 1.0

                logger.info(f"Loading LoRA weights from {lora_weight} with multiplier {lora_multiplier}")
                weights_sd = load_file(lora_weight)

                # Filter to exclude keys that are part of single_blocks
                if args.exclude_single_blocks:
                    filtered_weights = {k: v for k, v in weights_sd.items() if "single_blocks" not in k}
                    weights_sd = filtered_weights

                if args.lycoris:
                    lycoris_net, _ = create_network_from_weights(
                        multiplier=lora_multiplier,
                        file=None,
                        weights_sd=weights_sd,
                        unet=transformer,
                        text_encoder=None,
                        vae=None,
                        for_inference=True,
                    )
                else:
                    network = lora.create_arch_network_from_weights(
                        lora_multiplier, weights_sd, unet=transformer, for_inference=True
                    )
                logger.info("Merging LoRA weights to DiT model")

                if args.lycoris:
                    lycoris_net.merge_to(None, transformer, weights_sd, dtype=None, device=device)
                else:
                    network.merge_to(None, transformer, weights_sd, device=device, non_blocking=True)

                synchronize_device(device)

                logger.info("LoRA weights loaded")

            # save model here before casting to dit_weight_dtype
            if args.save_merged_model:
                logger.info(f"Saving merged model to {args.save_merged_model}")
                mem_eff_save_file(transformer.state_dict(), args.save_merged_model)  # save_file needs a lot of memory
                logger.info("Merged model saved")
                return

        logger.info(f"Casting model to {dit_weight_dtype}")
        transformer.to(dtype=dit_weight_dtype)

        if args.fp8_fast:
            # keep numerically sensitive modules in bf16; the rest stays fp8
            logger.info("Enabling FP8 acceleration")
            params_to_keep = {"norm", "bias", "time_in", "vector_in", "guidance_in", "txt_in", "img_in"}
            for name, param in transformer.named_parameters():
                dtype_to_use = dit_dtype if any(keyword in name for keyword in params_to_keep) else dit_weight_dtype
                param.to(dtype=dtype_to_use)
            convert_fp8_linear(transformer, dit_dtype, params_to_keep=params_to_keep)

        if args.compile:
            compile_backend, compile_mode, compile_dynamic, compile_fullgraph = args.compile_args
            logger.info(
                f"Torch Compiling[Backend: {compile_backend}; Mode: {compile_mode}; Dynamic: {compile_dynamic}; Fullgraph: {compile_fullgraph}]"
            )
            torch._dynamo.config.cache_size_limit = 32
            # NOTE(review): `x.lower() in "true"` is a substring test, not a bool
            # parse — e.g. "t" would also be truthy. Confirm intended semantics.
            for i, block in enumerate(transformer.single_blocks):
                compiled_block = torch.compile(
                    block,
                    backend=compile_backend,
                    mode=compile_mode,
                    dynamic=compile_dynamic.lower() in "true",
                    fullgraph=compile_fullgraph.lower() in "true",
                )
                transformer.single_blocks[i] = compiled_block
            for i, block in enumerate(transformer.double_blocks):
                compiled_block = torch.compile(
                    block,
                    backend=compile_backend,
                    mode=compile_mode,
                    dynamic=compile_dynamic.lower() in "true",
                    fullgraph=compile_fullgraph.lower() in "true",
                )
                transformer.double_blocks[i] = compiled_block

        if blocks_to_swap > 0:
            logger.info(f"Enable swap {blocks_to_swap} blocks to CPU from device: {device}")
            transformer.enable_block_swap(blocks_to_swap, device, supports_backward=False)
            transformer.move_to_device_except_swap_blocks(device)
            transformer.prepare_block_swap_before_forward()
        else:
            logger.info(f"Moving model to {device}")
            transformer.to(device=device)
        if args.img_in_txt_in_offloading:
            logger.info("Enable offloading img_in and txt_in to CPU")
            transformer.enable_img_in_txt_in_offloading()

        # load scheduler
        logger.info(f"Loading scheduler")
        scheduler = FlowMatchDiscreteScheduler(shift=args.flow_shift, reverse=True, solver="euler")

        # Prepare timesteps
        num_inference_steps = args.infer_steps
        scheduler.set_timesteps(num_inference_steps, device=device)  # n_tokens is not used in FlowMatchDiscreteScheduler
        timesteps = scheduler.timesteps

        # Prepare generator
        num_videos_per_prompt = 1  # currently only support 1 video per prompt, this is a batch size
        seed = args.seed
        if seed is None:
            seeds = [random.randint(0, 2**32 - 1) for _ in range(num_videos_per_prompt)]
        elif isinstance(seed, int):
            seeds = [seed + i for i in range(num_videos_per_prompt)]
        else:
            raise ValueError(f"Seed must be an integer or None, got {seed}.")
        generator = [torch.Generator(device).manual_seed(seed) for seed in seeds]

        # Prepare noisy latents
        num_channels_latents = 16  # transformer.config.in_channels
        vae_scale_factor = 2 ** (4 - 1)  # len(self.vae.config.block_out_channels) == 4

        # NOTE(review): `vae` here is presumably the VAE *module* imported at
        # file top (VAE_VER is a module-level constant) — confirm upstream.
        vae_ver = vae.VAE_VER
        if "884" in vae_ver:
            latent_video_length = (video_length - 1) // 4 + 1
        elif "888" in vae_ver:
            latent_video_length = (video_length - 1) // 8 + 1
        else:
            latent_video_length = video_length

        # make first N frames to be the same if the given seed is same: noise is
        # drawn frame-by-frame so a shorter video shares its prefix with a longer one
        shape_of_frame = (num_videos_per_prompt, num_channels_latents, 1, height // vae_scale_factor, width // vae_scale_factor)
        latents = []
        for i in range(latent_video_length):
            latents.append(randn_tensor(shape_of_frame, generator=generator, device=device, dtype=dit_dtype))
        latents = torch.cat(latents, dim=2)

        # pad image_latents to match the length of video_latents (i2v: only frame 0 is real)
        if image_latents is not None:
            zero_latents = torch.zeros_like(latents)
            zero_latents[:, :, :1, :, :] = image_latents
            image_latents = zero_latents

        if args.video_path is not None:
            # v2v inference: mix noise and source latents, then denoise the tail
            # of the schedule only (larger strength => more steps, more change).
            noise = latents
            assert noise.shape == video_latents.shape, f"noise shape {noise.shape} != video_latents shape {video_latents.shape}"

            num_inference_steps = int(num_inference_steps * args.strength)
            timestep_start = scheduler.timesteps[-num_inference_steps]  # larger strength, less inference steps and more start time
            t = timestep_start / 1000.0
            latents = noise * t + video_latents * (1 - t)

            timesteps = timesteps[-num_inference_steps:]

            logger.info(f"strength: {args.strength}, num_inference_steps: {num_inference_steps}, timestep_start: {timestep_start}")

        # FlowMatchDiscreteScheduler does not have init_noise_sigma

        # Denoising loop
        embedded_guidance_scale = args.embedded_cfg_scale
        if embedded_guidance_scale is not None:
            guidance_expand = torch.tensor([embedded_guidance_scale * 1000.0] * latents.shape[0], dtype=torch.float32, device="cpu")
            guidance_expand = guidance_expand.to(device=device, dtype=dit_dtype)
            if do_classifier_free_guidance:
                guidance_expand = torch.cat([guidance_expand, guidance_expand], dim=0)
        else:
            # NOTE(review): the transformer call below slices guidance_expand
            # unconditionally; None only works if embedded_cfg_scale is never
            # None in practice (its argparse default is 6.0) — confirm.
            guidance_expand = None
        freqs_cos, freqs_sin = get_rotary_pos_embed(vae_ver, transformer, video_length, height, width)

        # move and cast all inputs to the correct device and dtype
        prompt_embeds = prompt_embeds.to(device=device, dtype=dit_dtype)
        prompt_mask = prompt_mask.to(device=device)
        prompt_embeds_2 = prompt_embeds_2.to(device=device, dtype=dit_dtype)
        prompt_mask_2 = prompt_mask_2.to(device=device)

        freqs_cos = freqs_cos.to(device=device, dtype=dit_dtype)
        freqs_sin = freqs_sin.to(device=device, dtype=dit_dtype)

        num_warmup_steps = len(timesteps) - num_inference_steps * scheduler.order  # this should be 0 in v2v inference

        # split_attn implies split_uncond: attention is computed per sample
        if args.split_attn and do_classifier_free_guidance and not args.split_uncond:
            logger.warning("split_attn is enabled, split_uncond will be enabled as well.")
            args.split_uncond = True

        with tqdm(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                latents = scheduler.scale_model_input(latents, t)

                # predict the noise residual
                with torch.no_grad(), accelerator.autocast():
                    latents_input = latents if not do_classifier_free_guidance else torch.cat([latents, latents], dim=0)
                    if image_latents is not None:
                        latents_image_input = (
                            image_latents if not do_classifier_free_guidance else torch.cat([image_latents, image_latents], dim=0)
                        )
                        latents_input = torch.cat([latents_input, latents_image_input], dim=1)  # 1 or 2, C*2, F, H, W

                    # split_uncond runs cond/uncond one at a time to save memory
                    batch_size = 1 if args.split_uncond else latents_input.shape[0]

                    noise_pred_list = []
                    for j in range(0, latents_input.shape[0], batch_size):
                        noise_pred = transformer(
                            latents_input[j : j + batch_size],
                            t.repeat(batch_size).to(device=device, dtype=dit_dtype),
                            text_states=prompt_embeds[j : j + batch_size],
                            text_mask=prompt_mask[j : j + batch_size],
                            text_states_2=prompt_embeds_2[j : j + batch_size],
                            freqs_cos=freqs_cos,  # [seqlen, head_dim]
                            freqs_sin=freqs_sin,  # [seqlen, head_dim]
                            guidance=guidance_expand[j : j + batch_size],
                            return_dict=True,
                        )["x"]
                        noise_pred_list.append(noise_pred)
                    noise_pred = torch.cat(noise_pred_list, dim=0)

                # perform classifier free guidance
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + args.guidance_scale * (noise_pred_cond - noise_pred_uncond)

                # SkyReels' rescale noise config is omitted for now

                # compute the previous noisy sample x_t -> x_t-1
                latents = scheduler.step(noise_pred, t, latents, return_dict=False)[0]

                # update progress bar
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % scheduler.order == 0):
                    if progress_bar is not None:
                        progress_bar.update()

        # free the transformer before decoding to keep peak memory down
        latents = latents.detach().cpu()
        transformer = None
        clean_memory_on_device(device)

    # Save samples
    output_type = args.output_type
    save_path = args.save_path
    os.makedirs(save_path, exist_ok=True)
    time_flag = datetime.fromtimestamp(time.time()).strftime("%Y%m%d-%H%M%S")

    if output_type == "latent" or output_type == "both":
        # save latent (one .safetensors per batch item, with generation metadata)
        for i, latent in enumerate(latents):
            latent_path = f"{save_path}/{time_flag}_{i}_{seeds[i]}_latent.safetensors"

            if args.no_metadata:
                metadata = None
            else:
                metadata = {
                    "seeds": f"{seeds[i]}",
                    "prompt": f"{args.prompt}",
                    "height": f"{height}",
                    "width": f"{width}",
                    "video_length": f"{video_length}",
                    "infer_steps": f"{num_inference_steps}",
                    "guidance_scale": f"{args.guidance_scale}",
                    "embedded_cfg_scale": f"{args.embedded_cfg_scale}",
                }
                if args.negative_prompt is not None:
                    metadata["negative_prompt"] = f"{args.negative_prompt}"
            sd = {"latent": latent}
            save_file(sd, latent_path, metadata=metadata)

            logger.info(f"Latent save to: {latent_path}")
    if output_type == "video" or output_type == "both":
        # save video
        videos = decode_latents(args, latents, device)
        for i, sample in enumerate(videos):
            original_name = "" if original_base_names is None else f"_{original_base_names[i]}"
            sample = sample.unsqueeze(0)
            video_path = f"{save_path}/{time_flag}_{i}_{seeds[i]}{original_name}.mp4"
            save_videos_grid(sample, video_path, fps=args.fps)
            logger.info(f"Sample save to: {video_path}")
    elif output_type == "images":
        # save images
        videos = decode_latents(args, latents, device)
        for i, sample in enumerate(videos):
            original_name = "" if original_base_names is None else f"_{original_base_names[i]}"
            sample = sample.unsqueeze(0)
            image_name = f"{time_flag}_{i}_{seeds[i]}{original_name}"
            save_images_grid(sample, save_path, image_name)
            logger.info(f"Sample images save to: {save_path}/{image_name}")

    logger.info("Done!")
933
+
934
+
935
# CLI entry point: run inference only when executed directly, not on import.
if __name__ == "__main__":
    main()
merge_lora.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import logging
3
+ import torch
4
+ from safetensors.torch import load_file
5
+ from networks import lora
6
+ from utils.safetensors_utils import mem_eff_save_file
7
+ from hunyuan_model.models import load_transformer
8
+
9
# Module-level logger; basicConfig configures root logging for CLI use.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
11
+
12
+
13
def parse_args():
    """Parse CLI arguments for merging LoRA weights into a DiT checkpoint.

    Returns:
        argparse.Namespace with checkpoint paths, per-LoRA multipliers, and
        the merge device.
    """
    ap = argparse.ArgumentParser(description="HunyuanVideo model merger script")

    ap.add_argument("--dit", type=str, required=True, help="DiT checkpoint path or directory")
    ap.add_argument("--dit_in_channels", type=int, default=16, help="input channels for DiT, default is 16, skyreels I2V is 32")
    ap.add_argument("--lora_weight", type=str, nargs="*", required=False, default=None, help="LoRA weight path")
    ap.add_argument("--lora_multiplier", type=float, nargs="*", default=[1.0], help="LoRA multiplier (can specify multiple values)")
    ap.add_argument("--save_merged_model", type=str, required=True, help="Path to save the merged model")
    ap.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device to use for merging")

    return ap.parse_args()
24
+
25
+
26
def main():
    """Load a DiT checkpoint, merge LoRA weight files into it, save the result."""
    args = parse_args()

    device = torch.device(args.device)
    logger.info(f"Using device: {device}")

    # Base model is loaded on CPU in bf16; LoRA merging runs on `device`.
    logger.info(f"Loading DiT model from {args.dit}")
    transformer = load_transformer(args.dit, "torch", False, "cpu", torch.bfloat16, in_channels=args.dit_in_channels)
    transformer.eval()

    # Merge every LoRA file, pairing each with its multiplier (default 1.0).
    if args.lora_weight:
        multipliers = args.lora_multiplier
        for idx, weight_path in enumerate(args.lora_weight):
            multiplier = multipliers[idx] if multipliers is not None and idx < len(multipliers) else 1.0

            logger.info(f"Loading LoRA weights from {weight_path} with multiplier {multiplier}")
            state_dict = load_file(weight_path)
            network = lora.create_arch_network_from_weights(
                multiplier, state_dict, unet=transformer, for_inference=True
            )
            logger.info("Merging LoRA weights to DiT model")
            network.merge_to(None, transformer, state_dict, device=device, non_blocking=True)

            logger.info("LoRA weights loaded")

    # Persist the merged weights (memory-efficient writer).
    logger.info(f"Saving merged model to {args.save_merged_model}")
    mem_eff_save_file(transformer.state_dict(), args.save_merged_model)
    logger.info("Merged model saved")
61
+
62
# Script entry point: run the merge only when executed directly.
if __name__ == "__main__":
    main()
modules/__init__.py ADDED
File without changes