# build-tools / diffusers /modular_pipelines /wan /modular_blocks_wan.py
# salmankhanpm's picture
# Add files using upload-large-folder tool
# 4f4376a verified
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ...utils import logging
from ..modular_pipeline import SequentialPipelineBlocks
from ..modular_pipeline_utils import OutputParam
from .before_denoise import (
WanPrepareLatentsStep,
WanSetTimestepsStep,
WanTextInputStep,
)
from .decoders import WanVaeDecoderStep
from .denoise import (
WanDenoiseStep,
)
from .encoders import (
WanTextEncoderStep,
)
# Module-level logger, named after this module via the package's logging helper.
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
# ====================
# 1. DENOISE
# ====================
# inputs(text) -> set_timesteps -> prepare_latents -> denoise
# auto_docstring
class WanCoreDenoiseStep(SequentialPipelineBlocks):
    """
    Core denoising stage: consumes encoded conditions and runs the denoising process.

    Sub-blocks, in declaration order: `input` -> `set_timesteps` -> `prepare_latents` -> `denoise`.

    Components:
        transformer (`WanTransformer3DModel`)
        scheduler (`UniPCMultistepScheduler`)
        guider (`ClassifierFreeGuidance`)

    Inputs:
        num_videos_per_prompt (`None`, *optional*, defaults to 1):
            TODO: Add description.
        prompt_embeds (`Tensor`):
            Pre-generated text embeddings. Can be generated from text_encoder step.
        negative_prompt_embeds (`Tensor`, *optional*):
            Pre-generated negative text embeddings. Can be generated from text_encoder step.
        num_inference_steps (`None`, *optional*, defaults to 50):
            TODO: Add description.
        timesteps (`None`, *optional*):
            TODO: Add description.
        sigmas (`None`, *optional*):
            TODO: Add description.
        height (`int`, *optional*):
            TODO: Add description.
        width (`int`, *optional*):
            TODO: Add description.
        num_frames (`int`, *optional*):
            TODO: Add description.
        latents (`Tensor | NoneType`, *optional*):
            TODO: Add description.
        generator (`None`, *optional*):
            TODO: Add description.
        attention_kwargs (`None`, *optional*):
            TODO: Add description.

    Outputs:
        latents (`Tensor`):
            Denoised latents.
    """

    model_name = "wan"

    # `block_classes` and `block_names` are parallel lists: entry i of one pairs with entry i of the other.
    block_classes = [WanTextInputStep, WanSetTimestepsStep, WanPrepareLatentsStep, WanDenoiseStep]
    block_names = [
        "input",
        "set_timesteps",
        "prepare_latents",
        "denoise",
    ]

    @property
    def description(self):
        """One-line human-readable summary of this block."""
        summary = "denoise block that takes encoded conditions and runs the denoising process."
        return summary

    @property
    def outputs(self):
        """Output parameters exposed by this block: a single `latents` entry."""
        latents_param = OutputParam.template("latents")
        return [latents_param]
# ====================
# 2. BLOCKS (Wan2.1 text2video)
# ====================
# auto_docstring
class WanBlocks(SequentialPipelineBlocks):
    """
    Modular pipeline blocks for Wan2.1.

    Sub-blocks, in declaration order: `text_encoder` -> `denoise` -> `decode`.

    Components:
        text_encoder (`UMT5EncoderModel`)
        tokenizer (`AutoTokenizer`)
        guider (`ClassifierFreeGuidance`)
        transformer (`WanTransformer3DModel`)
        scheduler (`UniPCMultistepScheduler`)
        vae (`AutoencoderKLWan`)
        video_processor (`VideoProcessor`)

    Inputs:
        prompt (`None`, *optional*):
            TODO: Add description.
        negative_prompt (`None`, *optional*):
            TODO: Add description.
        max_sequence_length (`None`, *optional*, defaults to 512):
            TODO: Add description.
        num_videos_per_prompt (`None`, *optional*, defaults to 1):
            TODO: Add description.
        num_inference_steps (`None`, *optional*, defaults to 50):
            TODO: Add description.
        timesteps (`None`, *optional*):
            TODO: Add description.
        sigmas (`None`, *optional*):
            TODO: Add description.
        height (`int`, *optional*):
            TODO: Add description.
        width (`int`, *optional*):
            TODO: Add description.
        num_frames (`int`, *optional*):
            TODO: Add description.
        latents (`Tensor | NoneType`, *optional*):
            TODO: Add description.
        generator (`None`, *optional*):
            TODO: Add description.
        attention_kwargs (`None`, *optional*):
            TODO: Add description.
        output_type (`str`, *optional*, defaults to np):
            The output type of the decoded videos

    Outputs:
        videos (`list`):
            The generated videos.
    """

    model_name = "wan"

    # `block_classes` and `block_names` are parallel lists: entry i of one pairs with entry i of the other.
    block_classes = [WanTextEncoderStep, WanCoreDenoiseStep, WanVaeDecoderStep]
    block_names = [
        "text_encoder",
        "denoise",
        "decode",
    ]

    @property
    def description(self):
        """One-line human-readable summary of this block."""
        summary = "Modular pipeline blocks for Wan2.1."
        return summary

    @property
    def outputs(self):
        """Output parameters exposed by this block: a single `videos` entry."""
        videos_param = OutputParam.template("videos")
        return [videos_param]