# Copyright 2025 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks from ..modular_pipeline_utils import OutputParam from .before_denoise import ( WanPrepareLatentsStep, WanSetTimestepsStep, WanTextInputStep, ) from .decoders import WanVaeDecoderStep from .denoise import ( WanDenoiseStep, ) from .encoders import ( WanTextEncoderStep, ) logger = logging.get_logger(__name__) # pylint: disable=invalid-name # ==================== # 1. DENOISE # ==================== # inputs(text) -> set_timesteps -> prepare_latents -> denoise # auto_docstring class WanCoreDenoiseStep(SequentialPipelineBlocks): """ denoise block that takes encoded conditions and runs the denoising process. Components: transformer (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) guider (`ClassifierFreeGuidance`) Inputs: num_videos_per_prompt (`None`, *optional*, defaults to 1): TODO: Add description. prompt_embeds (`Tensor`): Pre-generated text embeddings. Can be generated from text_encoder step. negative_prompt_embeds (`Tensor`, *optional*): Pre-generated negative text embeddings. Can be generated from text_encoder step. num_inference_steps (`None`, *optional*, defaults to 50): TODO: Add description. timesteps (`None`, *optional*): TODO: Add description. sigmas (`None`, *optional*): TODO: Add description. height (`int`, *optional*): TODO: Add description. width (`int`, *optional*): TODO: Add description. num_frames (`int`, *optional*): TODO: Add description. latents (`Tensor | NoneType`, *optional*): TODO: Add description. generator (`None`, *optional*): TODO: Add description. attention_kwargs (`None`, *optional*): TODO: Add description. Outputs: latents (`Tensor`): Denoised latents. """ model_name = "wan" block_classes = [ WanTextInputStep, WanSetTimestepsStep, WanPrepareLatentsStep, WanDenoiseStep, ] block_names = ["input", "set_timesteps", "prepare_latents", "denoise"] @property def description(self): return "denoise block that takes encoded conditions and runs the denoising process." @property def outputs(self): return [OutputParam.template("latents")] # ==================== # 2. BLOCKS (Wan2.1 text2video) # ==================== # auto_docstring class WanBlocks(SequentialPipelineBlocks): """ Modular pipeline blocks for Wan2.1. Components: text_encoder (`UMT5EncoderModel`) tokenizer (`AutoTokenizer`) guider (`ClassifierFreeGuidance`) transformer (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) vae (`AutoencoderKLWan`) video_processor (`VideoProcessor`) Inputs: prompt (`None`, *optional*): TODO: Add description. negative_prompt (`None`, *optional*): TODO: Add description. max_sequence_length (`None`, *optional*, defaults to 512): TODO: Add description. num_videos_per_prompt (`None`, *optional*, defaults to 1): TODO: Add description. num_inference_steps (`None`, *optional*, defaults to 50): TODO: Add description. timesteps (`None`, *optional*): TODO: Add description. sigmas (`None`, *optional*): TODO: Add description. height (`int`, *optional*): TODO: Add description. width (`int`, *optional*): TODO: Add description. num_frames (`int`, *optional*): TODO: Add description. latents (`Tensor | NoneType`, *optional*): TODO: Add description. generator (`None`, *optional*): TODO: Add description. attention_kwargs (`None`, *optional*): TODO: Add description. output_type (`str`, *optional*, defaults to np): The output type of the decoded videos Outputs: videos (`list`): The generated videos. """ model_name = "wan" block_classes = [ WanTextEncoderStep, WanCoreDenoiseStep, WanVaeDecoderStep, ] block_names = ["text_encoder", "denoise", "decode"] @property def description(self): return "Modular pipeline blocks for Wan2.1." @property def outputs(self): return [OutputParam.template("videos")]