# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Modular pipeline block presets for Wan 2.1 image-to-video.

Two workflows are assembled here from the step classes imported below:

- image2video (I2V): conditioned on a first frame (`image`).
- flf2v (FLF2V): conditioned on a first and a last frame (`image` + `last_image`).

Class docstrings preceded by the `# auto_docstring` marker mirror each block's `description`, components, inputs and
outputs; keep them in sync with the `description` property when editing.
"""

from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import OutputParam
from .before_denoise import (
    WanAdditionalInputsStep,
    WanPrepareLatentsStep,
    WanSetTimestepsStep,
    WanTextInputStep,
)
from .decoders import WanVaeDecoderStep
from .denoise import (
    WanImage2VideoDenoiseStep,
)
from .encoders import (
    WanFirstLastFrameImageEncoderStep,
    WanFirstLastFrameVaeEncoderStep,
    WanImageCropResizeStep,
    WanImageEncoderStep,
    WanImageResizeStep,
    WanPrepareFirstFrameLatentsStep,
    WanPrepareFirstLastFrameLatentsStep,
    WanTextEncoderStep,
    WanVaeEncoderStep,
)


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


# ====================
# 1. IMAGE ENCODER
# ====================


# wan2.1 I2V (first frame only)
# auto_docstring
class WanImage2VideoImageEncoderStep(SequentialPipelineBlocks):
    """
    Image2Video Image Encoder step that resize the image and encode the image to generate the image embeddings

      Components:
          image_processor (`CLIPImageProcessor`)
          image_encoder (`CLIPVisionModel`)

      Inputs:
          image (`Image`):
              TODO: Add description.
          height (`int`, *optional*, defaults to 480):
              TODO: Add description.
          width (`int`, *optional*, defaults to 832):
              TODO: Add description.

      Outputs:
          resized_image (`Image`):
              TODO: Add description.
          image_embeds (`Tensor`):
              The image embeddings
    """

    model_name = "wan-i2v"
    # Sub-blocks and their names are paired positionally.
    block_classes = [WanImageResizeStep, WanImageEncoderStep]
    block_names = ["image_resize", "image_encoder"]

    @property
    def description(self):
        return "Image2Video Image Encoder step that resize the image and encode the image to generate the image embeddings"


# wan2.1 FLF2V (first and last frame)
# auto_docstring
class WanFLF2VImageEncoderStep(SequentialPipelineBlocks):
    """
    FLF2V Image Encoder step that resize and encode and encode the first and last frame images to generate the image
    embeddings

      Components:
          image_processor (`CLIPImageProcessor`)
          image_encoder (`CLIPVisionModel`)

      Inputs:
          image (`Image`):
              TODO: Add description.
          height (`int`, *optional*, defaults to 480):
              TODO: Add description.
          width (`int`, *optional*, defaults to 832):
              TODO: Add description.
          last_image (`Image`):
              The last frameimage

      Outputs:
          resized_image (`Image`):
              TODO: Add description.
          resized_last_image (`Image`):
              TODO: Add description.
          image_embeds (`Tensor`):
              The image embeddings
    """

    model_name = "wan-i2v"
    # Sub-blocks and their names are paired positionally.
    block_classes = [WanImageResizeStep, WanImageCropResizeStep, WanFirstLastFrameImageEncoderStep]
    block_names = ["image_resize", "last_image_resize", "image_encoder"]

    @property
    def description(self):
        return "FLF2V Image Encoder step that resize and encode and encode the first and last frame images to generate the image embeddings"


# wan2.1 Auto Image Encoder
# auto_docstring
class WanAutoImageEncoderStep(AutoPipelineBlocks):
    """
    Image Encoder step that encode the image to generate the image embeddings. This is an auto pipeline block that
    works for image2video tasks.
     - `WanFLF2VImageEncoderStep` (flf2v) is used when `last_image` is provided.
     - `WanImage2VideoImageEncoderStep` (image2video) is used when `image` is provided.
     - if `last_image` or `image` is not provided, step will be skipped.

      Components:
          image_processor (`CLIPImageProcessor`)
          image_encoder (`CLIPVisionModel`)

      Inputs:
          image (`Image`, *optional*):
              TODO: Add description.
          height (`int`, *optional*, defaults to 480):
              TODO: Add description.
          width (`int`, *optional*, defaults to 832):
              TODO: Add description.
          last_image (`Image`, *optional*):
              The last frameimage

      Outputs:
          resized_image (`Image`):
              TODO: Add description.
          resized_last_image (`Image`):
              TODO: Add description.
          image_embeds (`Tensor`):
              The image embeddings
    """

    model_name = "wan-i2v"
    block_classes = [WanFLF2VImageEncoderStep, WanImage2VideoImageEncoderStep]
    block_names = ["flf2v_image_encoder", "image2video_image_encoder"]
    # Paired positionally with `block_classes`: `last_image` selects the FLF2V branch, `image` the first-frame
    # branch. The flf2v workflow supplies both inputs, so `last_image` is listed first.
    # NOTE(review): presumably the first matching trigger wins -- confirm against `AutoPipelineBlocks`.
    block_trigger_inputs = ["last_image", "image"]

    @property
    def description(self):
        # Adjacent literals are joined with no separator, so the first sentence must carry its own ". ".
        return (
            "Image Encoder step that encode the image to generate the image embeddings. "
            "This is an auto pipeline block that works for image2video tasks."
            " - `WanFLF2VImageEncoderStep` (flf2v) is used when `last_image` is provided."
            " - `WanImage2VideoImageEncoderStep` (image2video) is used when `image` is provided."
            " - if `last_image` or `image` is not provided, step will be skipped."
        )


# ====================
# 2. VAE ENCODER
# ====================


# wan2.1 I2V (first frame only)
# auto_docstring
class WanImage2VideoVaeEncoderStep(SequentialPipelineBlocks):
    """
    Image2Video Vae Image Encoder step that resize the image and encode the first frame image to its latent
    representation

      Components:
          vae (`AutoencoderKLWan`)
          video_processor (`VideoProcessor`)

      Inputs:
          image (`Image`):
              TODO: Add description.
          height (`int`, *optional*, defaults to 480):
              TODO: Add description.
          width (`int`, *optional*, defaults to 832):
              TODO: Add description.
          num_frames (`int`, *optional*, defaults to 81):
              TODO: Add description.
          generator (`None`, *optional*):
              TODO: Add description.

      Outputs:
          resized_image (`Image`):
              TODO: Add description.
          first_frame_latents (`Tensor`):
              video latent representation with the first frame image condition
          image_condition_latents (`Tensor | NoneType`):
              TODO: Add description.
    """

    model_name = "wan-i2v"
    # Sub-blocks and their names are paired positionally.
    block_classes = [WanImageResizeStep, WanVaeEncoderStep, WanPrepareFirstFrameLatentsStep]
    block_names = ["image_resize", "vae_encoder", "prepare_first_frame_latents"]

    @property
    def description(self):
        return "Image2Video Vae Image Encoder step that resize the image and encode the first frame image to its latent representation"


# wan2.1 FLF2V (first and last frame)
# auto_docstring
class WanFLF2VVaeEncoderStep(SequentialPipelineBlocks):
    """
    FLF2V Vae Image Encoder step that resize and encode and encode the first and last frame images to generate the
    latent conditions

      Components:
          vae (`AutoencoderKLWan`)
          video_processor (`VideoProcessor`)

      Inputs:
          image (`Image`):
              TODO: Add description.
          height (`int`, *optional*, defaults to 480):
              TODO: Add description.
          width (`int`, *optional*, defaults to 832):
              TODO: Add description.
          last_image (`Image`):
              The last frameimage
          num_frames (`int`, *optional*, defaults to 81):
              TODO: Add description.
          generator (`None`, *optional*):
              TODO: Add description.

      Outputs:
          resized_image (`Image`):
              TODO: Add description.
          resized_last_image (`Image`):
              TODO: Add description.
          first_last_frame_latents (`Tensor`):
              video latent representation with the first and last frame images condition
          image_condition_latents (`Tensor | NoneType`):
              TODO: Add description.
    """

    model_name = "wan-i2v"
    # Sub-blocks and their names are paired positionally.
    block_classes = [
        WanImageResizeStep,
        WanImageCropResizeStep,
        WanFirstLastFrameVaeEncoderStep,
        WanPrepareFirstLastFrameLatentsStep,
    ]
    block_names = ["image_resize", "last_image_resize", "vae_encoder", "prepare_first_last_frame_latents"]

    @property
    def description(self):
        return "FLF2V Vae Image Encoder step that resize and encode and encode the first and last frame images to generate the latent conditions"


# wan2.1 Auto Vae Encoder
# auto_docstring
class WanAutoVaeEncoderStep(AutoPipelineBlocks):
    """
    Vae Image Encoder step that encode the image to generate the image latents. This is an auto pipeline block that
    works for image2video tasks.
     - `WanFLF2VVaeEncoderStep` (flf2v) is used when `last_image` is provided.
     - `WanImage2VideoVaeEncoderStep` (image2video) is used when `image` is provided.
     - if `last_image` or `image` is not provided, step will be skipped.

      Components:
          vae (`AutoencoderKLWan`)
          video_processor (`VideoProcessor`)

      Inputs:
          image (`Image`, *optional*):
              TODO: Add description.
          height (`int`, *optional*, defaults to 480):
              TODO: Add description.
          width (`int`, *optional*, defaults to 832):
              TODO: Add description.
          last_image (`Image`, *optional*):
              The last frameimage
          num_frames (`int`, *optional*, defaults to 81):
              TODO: Add description.
          generator (`None`, *optional*):
              TODO: Add description.

      Outputs:
          resized_image (`Image`):
              TODO: Add description.
          resized_last_image (`Image`):
              TODO: Add description.
          first_last_frame_latents (`Tensor`):
              video latent representation with the first and last frame images condition
          image_condition_latents (`Tensor | NoneType`):
              TODO: Add description.
          first_frame_latents (`Tensor`):
              video latent representation with the first frame image condition
    """

    model_name = "wan-i2v"
    block_classes = [WanFLF2VVaeEncoderStep, WanImage2VideoVaeEncoderStep]
    block_names = ["flf2v_vae_encoder", "image2video_vae_encoder"]
    # Paired positionally with `block_classes`: `last_image` selects the FLF2V branch, `image` the first-frame
    # branch. The flf2v workflow supplies both inputs, so `last_image` is listed first.
    # NOTE(review): presumably the first matching trigger wins -- confirm against `AutoPipelineBlocks`.
    block_trigger_inputs = ["last_image", "image"]

    @property
    def description(self):
        # Adjacent literals are joined with no separator, so the first sentence must carry its own ". ".
        return (
            "Vae Image Encoder step that encode the image to generate the image latents. "
            "This is an auto pipeline block that works for image2video tasks."
            " - `WanFLF2VVaeEncoderStep` (flf2v) is used when `last_image` is provided."
            " - `WanImage2VideoVaeEncoderStep` (image2video) is used when `image` is provided."
            " - if `last_image` or `image` is not provided, step will be skipped."
        )


# ====================
# 3. DENOISE (inputs -> set_timesteps -> prepare_latents -> denoise)
# ====================


# wan2.1 I2V core denoise (support both I2V and FLF2V)
# inputs (text + image_condition_latents) -> set_timesteps -> prepare_latents -> denoise (latents)
# auto_docstring
class WanImage2VideoCoreDenoiseStep(SequentialPipelineBlocks):
    """
    denoise block that takes encoded text and image latent conditions and runs the denoising process.

      Components:
          transformer (`WanTransformer3DModel`)
          scheduler (`UniPCMultistepScheduler`)
          guider (`ClassifierFreeGuidance`)

      Inputs:
          num_videos_per_prompt (`None`, *optional*, defaults to 1):
              TODO: Add description.
          prompt_embeds (`Tensor`):
              Pre-generated text embeddings. Can be generated from text_encoder step.
          negative_prompt_embeds (`Tensor`, *optional*):
              Pre-generated negative text embeddings. Can be generated from text_encoder step.
          height (`None`, *optional*):
              TODO: Add description.
          width (`None`, *optional*):
              TODO: Add description.
          num_frames (`None`, *optional*):
              TODO: Add description.
          image_condition_latents (`None`, *optional*):
              TODO: Add description.
          num_inference_steps (`None`, *optional*, defaults to 50):
              TODO: Add description.
          timesteps (`None`, *optional*):
              TODO: Add description.
          sigmas (`None`, *optional*):
              TODO: Add description.
          latents (`Tensor | NoneType`, *optional*):
              TODO: Add description.
          generator (`None`, *optional*):
              TODO: Add description.
          attention_kwargs (`None`, *optional*):
              TODO: Add description.
          image_embeds (`Tensor`):
              TODO: Add description.

      Outputs:
          batch_size (`int`):
              Number of prompts, the final batch size of model inputs should be batch_size * num_videos_per_prompt
          dtype (`dtype`):
              Data type of model tensor inputs (determined by `transformer.dtype`)
          latents (`Tensor`):
              The initial latents to use for the denoising process
    """

    model_name = "wan-i2v"
    block_classes = [
        WanTextInputStep,
        # Unlike its siblings this entry is a pre-configured instance, not a class: it is constructed here with
        # `image_condition_latents` registered as an image-latent input.
        # NOTE(review): assumes `SequentialPipelineBlocks` accepts instances in `block_classes` -- confirm.
        WanAdditionalInputsStep(image_latent_inputs=["image_condition_latents"]),
        WanSetTimestepsStep,
        WanPrepareLatentsStep,
        WanImage2VideoDenoiseStep,
    ]
    block_names = [
        "input",
        "additional_inputs",
        "set_timesteps",
        "prepare_latents",
        "denoise",
    ]

    @property
    def description(self):
        return "denoise block that takes encoded text and image latent conditions and runs the denoising process."


# ====================
# 4. BLOCKS (Wan2.1 Image2Video)
# ====================


# wan2.1 Image2Video Auto Blocks
# auto_docstring
class WanImage2VideoAutoBlocks(SequentialPipelineBlocks):
    """
    Auto Modular pipeline for image-to-video using Wan.

      Supported workflows:
        - `image2video`: requires `image`, `prompt`
        - `flf2v`: requires `last_image`, `image`, `prompt`

      Components:
          text_encoder (`UMT5EncoderModel`)
          tokenizer (`AutoTokenizer`)
          guider (`ClassifierFreeGuidance`)
          image_processor (`CLIPImageProcessor`)
          image_encoder (`CLIPVisionModel`)
          vae (`AutoencoderKLWan`)
          video_processor (`VideoProcessor`)
          transformer (`WanTransformer3DModel`)
          scheduler (`UniPCMultistepScheduler`)

      Inputs:
          prompt (`None`, *optional*):
              TODO: Add description.
          negative_prompt (`None`, *optional*):
              TODO: Add description.
          max_sequence_length (`None`, *optional*, defaults to 512):
              TODO: Add description.
          image (`Image`, *optional*):
              TODO: Add description.
          height (`int`, *optional*, defaults to 480):
              TODO: Add description.
          width (`int`, *optional*, defaults to 832):
              TODO: Add description.
          last_image (`Image`, *optional*):
              The last frameimage
          num_frames (`int`, *optional*, defaults to 81):
              TODO: Add description.
          generator (`None`, *optional*):
              TODO: Add description.
          num_videos_per_prompt (`None`, *optional*, defaults to 1):
              TODO: Add description.
          image_condition_latents (`None`, *optional*):
              TODO: Add description.
          num_inference_steps (`None`, *optional*, defaults to 50):
              TODO: Add description.
          timesteps (`None`, *optional*):
              TODO: Add description.
          sigmas (`None`, *optional*):
              TODO: Add description.
          latents (`Tensor | NoneType`, *optional*):
              TODO: Add description.
          attention_kwargs (`None`, *optional*):
              TODO: Add description.
          image_embeds (`Tensor`):
              TODO: Add description.
          output_type (`str`, *optional*, defaults to np):
              The output type of the decoded videos

      Outputs:
          videos (`list`):
              The generated videos.
    """

    model_name = "wan-i2v"
    # End-to-end order: text encode -> image encode -> VAE encode -> denoise -> decode.
    block_classes = [
        WanTextEncoderStep,
        WanAutoImageEncoderStep,
        WanAutoVaeEncoderStep,
        WanImage2VideoCoreDenoiseStep,
        WanVaeDecoderStep,
    ]
    block_names = [
        "text_encoder",
        "image_encoder",
        "vae_encoder",
        "denoise",
        "decode",
    ]
    # Workflow name -> inputs flagged `True` for that workflow (listed as "requires" in the class docstring).
    _workflow_map = {
        "image2video": {"image": True, "prompt": True},
        "flf2v": {"last_image": True, "image": True, "prompt": True},
    }

    @property
    def description(self):
        return "Auto Modular pipeline for image-to-video using Wan."

    @property
    def outputs(self):
        # Only the decoded videos are exposed as the pipeline-level output.
        return [OutputParam.template("videos")]