Spaces:
Running on Zero
Running on Zero
| # Copyright 2025 The HuggingFace Team. All rights reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| from ...utils import logging | |
| from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks | |
| from ..modular_pipeline_utils import InsertableDict | |
| from .before_denoise import ( | |
| WanAdditionalInputsStep, | |
| WanPrepareFirstFrameLatentsStep, | |
| WanPrepareFirstLastFrameLatentsStep, | |
| WanPrepareLatentsStep, | |
| WanSetTimestepsStep, | |
| WanTextInputStep, | |
| ) | |
| from .decoders import WanImageVaeDecoderStep | |
| from .denoise import ( | |
| Wan22DenoiseStep, | |
| Wan22Image2VideoDenoiseStep, | |
| WanDenoiseStep, | |
| WanFLF2VDenoiseStep, | |
| WanImage2VideoDenoiseStep, | |
| ) | |
| from .encoders import ( | |
| WanFirstLastFrameImageEncoderStep, | |
| WanFirstLastFrameVaeImageEncoderStep, | |
| WanImageCropResizeStep, | |
| WanImageEncoderStep, | |
| WanImageResizeStep, | |
| WanTextEncoderStep, | |
| WanVaeImageEncoderStep, | |
| ) | |
# Module-level logger, named after this module via `__name__`.
| logger = logging.get_logger(__name__) # pylint: disable=invalid-name | |
| # wan2.1 | |
| # wan2.1: text2vid | |
class WanCoreDenoiseStep(SequentialPipelineBlocks):
    """Wan 2.1 text-to-video denoise chain.

    Chains the text-input, set-timesteps, prepare-latents and denoise steps.
    """

    # Sub-block classes of this sequential block.
    block_classes = [
        WanTextInputStep,
        WanSetTimestepsStep,
        WanPrepareLatentsStep,
        WanDenoiseStep,
    ]
    # One name per entry in `block_classes`, in the same order.
    block_names = ["input", "set_timesteps", "prepare_latents", "denoise"]

    def description(self):
        """Return a human-readable summary of this block and its sub-steps."""
        header = (
            "denoise block that takes encoded conditions and runs the denoising process.\n"
            "This is a sequential pipeline blocks:\n"
        )
        # (step class name, what the step is used for) in execution-listing order.
        steps = [
            ("WanTextInputStep", "adjust the batch size of the model inputs"),
            ("WanSetTimestepsStep", "set the timesteps"),
            ("WanPrepareLatentsStep", "prepare the latents"),
            ("WanDenoiseStep", "denoise the latents"),
        ]
        bullets = "".join(f" - `{name}` is used to {purpose}\n" for name, purpose in steps)
        return header + bullets
| # wan2.1: image2video | |
| ## image encoder | |
class WanImage2VideoImageEncoderStep(SequentialPipelineBlocks):
    """Wan 2.1 image-to-video image encoder: resize step followed by the image encoder step."""

    model_name = "wan"
    # Sub-block classes and their names (same order in both lists).
    block_classes = [WanImageResizeStep, WanImageEncoderStep]
    block_names = ["image_resize", "image_encoder"]

    def description(self):
        """Return a one-line summary of what this block does."""
        summary = (
            "Image2Video Image Encoder step that resize the image "
            "and encode the image to generate the image embeddings"
        )
        return summary
| ## vae encoder | |
class WanImage2VideoVaeImageEncoderStep(SequentialPipelineBlocks):
    """Wan 2.1 image-to-video VAE encoder: resize step followed by the VAE image encoder step."""

    model_name = "wan"
    # Sub-block classes and their names (same order in both lists).
    block_classes = [WanImageResizeStep, WanVaeImageEncoderStep]
    block_names = ["image_resize", "vae_encoder"]

    def description(self):
        """Return a one-line summary of what this block does."""
        summary = (
            "Image2Video Vae Image Encoder step that resize the image "
            "and encode the first frame image to its latent representation"
        )
        return summary
| ## denoise | |
class WanImage2VideoCoreDenoiseStep(SequentialPipelineBlocks):
    """Wan 2.1 image-to-video denoise chain.

    Same shape as the text-to-video chain, with two extra steps that handle
    the `first_frame_latents` condition.
    """

    # Sub-block classes; note the additional-inputs entry is an instance
    # configured for `first_frame_latents`, not a bare class.
    block_classes = [
        WanTextInputStep,
        WanAdditionalInputsStep(image_latent_inputs=["first_frame_latents"]),
        WanSetTimestepsStep,
        WanPrepareLatentsStep,
        WanPrepareFirstFrameLatentsStep,
        WanImage2VideoDenoiseStep,
    ]
    # One name per entry in `block_classes`, in the same order.
    block_names = [
        "input",
        "additional_inputs",
        "set_timesteps",
        "prepare_latents",
        "prepare_first_frame_latents",
        "denoise",
    ]

    def description(self):
        """Return a human-readable summary of this block and its sub-steps."""
        header = (
            "denoise block that takes encoded text and image latent conditions and runs the denoising process.\n"
            "This is a sequential pipeline blocks:\n"
        )
        # (step class name, what the step is used for) in execution-listing order.
        steps = [
            ("WanTextInputStep", "adjust the batch size of the model inputs"),
            ("WanAdditionalInputsStep", "adjust the batch size of the latent conditions"),
            ("WanSetTimestepsStep", "set the timesteps"),
            ("WanPrepareLatentsStep", "prepare the latents"),
            ("WanPrepareFirstFrameLatentsStep", "prepare the first frame latent conditions"),
            ("WanImage2VideoDenoiseStep", "denoise the latents"),
        ]
        bullets = "".join(f" - `{name}` is used to {purpose}\n" for name, purpose in steps)
        return header + bullets
| # wan2.1: FLF2v | |
| ## image encoder | |
class WanFLF2VImageEncoderStep(SequentialPipelineBlocks):
    """Wan 2.1 first-last-frame-to-video (FLF2V) image encoder.

    Chains the image resize step, the crop-resize step (named
    `last_image_resize`) and the first/last-frame image encoder step.
    """

    model_name = "wan"
    # Sub-block classes and their names (same order in both lists).
    block_classes = [WanImageResizeStep, WanImageCropResizeStep, WanFirstLastFrameImageEncoderStep]
    block_names = ["image_resize", "last_image_resize", "image_encoder"]

    def description(self):
        """Return a one-line summary of what this block does."""
        # Fixed: the original text repeated "and encode" twice.
        return (
            "FLF2V Image Encoder step that resize and encode the first and last frame images "
            "to generate the image embeddings"
        )
| ## vae encoder | |
class WanFLF2VVaeImageEncoderStep(SequentialPipelineBlocks):
    """Wan 2.1 first-last-frame-to-video (FLF2V) VAE image encoder.

    Chains the image resize step, the crop-resize step (named
    `last_image_resize`) and the first/last-frame VAE image encoder step.
    """

    model_name = "wan"
    # Sub-block classes and their names (same order in both lists).
    block_classes = [WanImageResizeStep, WanImageCropResizeStep, WanFirstLastFrameVaeImageEncoderStep]
    block_names = ["image_resize", "last_image_resize", "vae_encoder"]

    def description(self):
        """Return a one-line summary of what this block does."""
        # Fixed: the original text repeated "and encode" twice.
        return (
            "FLF2V Vae Image Encoder step that resize and encode the first and last frame images "
            "to generate the latent conditions"
        )
| ## denoise | |
class WanFLF2VCoreDenoiseStep(SequentialPipelineBlocks):
    """Wan 2.1 first-last-frame-to-video (FLF2V) denoise chain.

    Same shape as the image-to-video chain, but conditioned on
    `first_last_frame_latents` and finishing with `WanFLF2VDenoiseStep`.
    """

    # Sub-block classes; the additional-inputs entry is an instance
    # configured for `first_last_frame_latents`, not a bare class.
    block_classes = [
        WanTextInputStep,
        WanAdditionalInputsStep(image_latent_inputs=["first_last_frame_latents"]),
        WanSetTimestepsStep,
        WanPrepareLatentsStep,
        WanPrepareFirstLastFrameLatentsStep,
        WanFLF2VDenoiseStep,
    ]
    # One name per entry in `block_classes`, in the same order.
    block_names = [
        "input",
        "additional_inputs",
        "set_timesteps",
        "prepare_latents",
        "prepare_first_last_frame_latents",
        "denoise",
    ]

    def description(self):
        """Return a human-readable summary of this block and its sub-steps."""
        # Fixed: the last bullet previously named `WanImage2VideoDenoiseStep`,
        # which is not the denoise step this chain uses.
        return (
            "denoise block that takes encoded text and image latent conditions and runs the denoising process.\n"
            "This is a sequential pipeline blocks:\n"
            " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n"
            " - `WanAdditionalInputsStep` is used to adjust the batch size of the latent conditions\n"
            " - `WanSetTimestepsStep` is used to set the timesteps\n"
            " - `WanPrepareLatentsStep` is used to prepare the latents\n"
            " - `WanPrepareFirstLastFrameLatentsStep` is used to prepare the latent conditions\n"
            " - `WanFLF2VDenoiseStep` is used to denoise the latents\n"
        )
| # wan2.1: auto blocks | |
| ## image encoder | |
class WanAutoImageEncoderStep(AutoPipelineBlocks):
    """Auto-selecting image encoder for Wan 2.1.

    Picks the FLF2V encoder when `last_image` is given, the image-to-video
    encoder when `image` is given, and is skipped otherwise (as stated by
    `description`).
    """

    # Candidate sub-blocks, their names, and the input that triggers each one
    # (the three lists line up by position).
    # NOTE(review): `last_image` is listed before `image`; presumably triggers are
    # checked in this order so FLF2V wins when both are given — confirm.
    block_classes = [WanFLF2VImageEncoderStep, WanImage2VideoImageEncoderStep]
    block_names = ["flf2v_image_encoder", "image2video_image_encoder"]
    block_trigger_inputs = ["last_image", "image"]

    def description(self):
        """Return a human-readable summary of the branches of this auto block."""
        # Fixed: the fragments used to be concatenated without any separator,
        # producing run-together text such as "embeddingsThis is ...".
        return (
            "Image Encoder step that encode the image to generate the image embeddings.\n"
            "This is an auto pipeline block that works for image2video tasks.\n"
            " - `WanFLF2VImageEncoderStep` (flf2v) is used when `last_image` is provided.\n"
            " - `WanImage2VideoImageEncoderStep` (image2video) is used when `image` is provided.\n"
            " - if neither `last_image` nor `image` is provided, step will be skipped."
        )
| ## vae encoder | |
class WanAutoVaeImageEncoderStep(AutoPipelineBlocks):
    """Auto-selecting VAE image encoder for Wan.

    Picks the FLF2V VAE encoder when `last_image` is given, the image-to-video
    VAE encoder when `image` is given, and is skipped otherwise (as stated by
    `description`).
    """

    # Candidate sub-blocks, their names, and the input that triggers each one
    # (the three lists line up by position).
    # NOTE(review): `last_image` is listed before `image`; presumably triggers are
    # checked in this order so FLF2V wins when both are given — confirm.
    block_classes = [WanFLF2VVaeImageEncoderStep, WanImage2VideoVaeImageEncoderStep]
    block_names = ["flf2v_vae_encoder", "image2video_vae_encoder"]
    block_trigger_inputs = ["last_image", "image"]

    def description(self):
        """Return a human-readable summary of the branches of this auto block."""
        # Fixed: the fragments used to be concatenated without any separator,
        # producing run-together text such as "latentsThis is ...".
        return (
            "Vae Image Encoder step that encode the image to generate the image latents.\n"
            "This is an auto pipeline block that works for image2video tasks.\n"
            " - `WanFLF2VVaeImageEncoderStep` (flf2v) is used when `last_image` is provided.\n"
            " - `WanImage2VideoVaeImageEncoderStep` (image2video) is used when `image` is provided.\n"
            " - if neither `last_image` nor `image` is provided, step will be skipped."
        )
| ## denoise | |
class WanAutoDenoiseStep(AutoPipelineBlocks):
    """Auto-selecting denoise step for Wan 2.1.

    Chooses between the FLF2V, image-to-video and text-to-video denoise
    chains based on which latent condition is present.
    """

    # Candidate sub-blocks, their names, and the input that triggers each one
    # (the three lists line up by position). The `None` trigger marks the
    # text2video chain as the branch used when no latent condition is given.
    block_classes = [
        WanFLF2VCoreDenoiseStep,
        WanImage2VideoCoreDenoiseStep,
        WanCoreDenoiseStep,
    ]
    block_names = ["flf2v", "image2video", "text2video"]
    block_trigger_inputs = ["first_last_frame_latents", "first_frame_latents", None]

    def description(self) -> str:
        """Return a human-readable summary of the branches of this auto block."""
        # Fixed: the previous text referred to classes that do not exist
        # (`WanCoreImage2VideoCoreDenoiseStep`, `WanCoreImage2VideoDenoiseStep`),
        # left out the flf2v branch, and ran its first sentences together
        # without line breaks.
        return (
            "Denoise step that iteratively denoise the latents. "
            "This is a auto pipeline block that works for text2video, image2video and flf2v tasks.\n"
            " - `WanFLF2VCoreDenoiseStep` (flf2v) is used when `first_last_frame_latents` is provided.\n"
            " - `WanImage2VideoCoreDenoiseStep` (image2video) is used when `first_frame_latents` is provided.\n"
            " - `WanCoreDenoiseStep` (text2video) is used when neither of them is provided.\n"
        )
| # auto pipeline blocks | |
class WanAutoBlocks(SequentialPipelineBlocks):
    """End-to-end auto pipeline for Wan 2.1.

    Chains the text encoder, the auto image encoder, the auto VAE image
    encoder, the auto denoise step and the decoder. The auto sub-blocks pick
    their branch from the inputs (`image`, `last_image`) that are supplied.
    """

    # Sub-block classes of this sequential block.
    block_classes = [
        WanTextEncoderStep,
        WanAutoImageEncoderStep,
        WanAutoVaeImageEncoderStep,
        WanAutoDenoiseStep,
        WanImageVaeDecoderStep,
    ]
    # One name per entry in `block_classes`, in the same order.
    block_names = [
        "text_encoder",
        "image_encoder",
        "vae_encoder",
        "denoise",
        "decode",
    ]

    def description(self):
        """Return a human-readable summary of the supported workflows."""
        # Fixed: the previous text only mentioned text-to-video, although the
        # encoder sub-blocks are triggered by `image` and `last_image`.
        return (
            "Auto Modular pipeline for text-to-video, image-to-video and first-last-frame-to-video using Wan.\n"
            "- for text-to-video generation, all you need to provide is `prompt`\n"
            "- for image-to-video generation, you need to provide `prompt` and `image`\n"
            "- for first-last-frame-to-video generation, you need to provide `prompt`, `image` and `last_image`"
        )
| # wan22 | |
| # wan2.2: text2vid | |
| ## denoise | |
class Wan22CoreDenoiseStep(SequentialPipelineBlocks):
    """Wan 2.2 text-to-video denoise chain.

    Chains the text-input, set-timesteps and prepare-latents steps, then
    denoises with `Wan22DenoiseStep`.
    """

    # Sub-block classes of this sequential block.
    block_classes = [
        WanTextInputStep,
        WanSetTimestepsStep,
        WanPrepareLatentsStep,
        Wan22DenoiseStep,
    ]
    # One name per entry in `block_classes`, in the same order.
    block_names = ["input", "set_timesteps", "prepare_latents", "denoise"]

    def description(self):
        """Return a human-readable summary of this block and its sub-steps."""
        parts = (
            "denoise block that takes encoded conditions and runs the denoising process.\n",
            "This is a sequential pipeline blocks:\n",
            " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n",
            " - `WanSetTimestepsStep` is used to set the timesteps\n",
            " - `WanPrepareLatentsStep` is used to prepare the latents\n",
            " - `Wan22DenoiseStep` is used to denoise the latents in wan2.2\n",
        )
        return "".join(parts)
| # wan2.2: image2video | |
| ## denoise | |
class Wan22Image2VideoCoreDenoiseStep(SequentialPipelineBlocks):
    """Wan 2.2 image-to-video denoise chain.

    Adds the `first_frame_latents` handling steps to the text-to-video chain
    and denoises with `Wan22Image2VideoDenoiseStep`.
    """

    # Sub-block classes; the additional-inputs entry is an instance
    # configured for `first_frame_latents`, not a bare class.
    block_classes = [
        WanTextInputStep,
        WanAdditionalInputsStep(image_latent_inputs=["first_frame_latents"]),
        WanSetTimestepsStep,
        WanPrepareLatentsStep,
        WanPrepareFirstFrameLatentsStep,
        Wan22Image2VideoDenoiseStep,
    ]
    # One name per entry in `block_classes`, in the same order.
    block_names = [
        "input",
        "additional_inputs",
        "set_timesteps",
        "prepare_latents",
        "prepare_first_frame_latents",
        "denoise",
    ]

    def description(self):
        """Return a human-readable summary of this block and its sub-steps."""
        parts = (
            "denoise block that takes encoded text and image latent conditions and runs the denoising process.\n",
            "This is a sequential pipeline blocks:\n",
            " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n",
            " - `WanAdditionalInputsStep` is used to adjust the batch size of the latent conditions\n",
            " - `WanSetTimestepsStep` is used to set the timesteps\n",
            " - `WanPrepareLatentsStep` is used to prepare the latents\n",
            " - `WanPrepareFirstFrameLatentsStep` is used to prepare the first frame latent conditions\n",
            " - `Wan22Image2VideoDenoiseStep` is used to denoise the latents in wan2.2\n",
        )
        return "".join(parts)
class Wan22AutoDenoiseStep(AutoPipelineBlocks):
    """Auto-selecting denoise step for Wan 2.2.

    Uses the image-to-video chain when `first_frame_latents` is present and
    the text-to-video chain otherwise.
    """

    # Candidate sub-blocks, their names, and the input that triggers each one
    # (the three lists line up by position). The `None` trigger marks the
    # text2video chain as the branch used when `first_frame_latents` is absent.
    block_classes = [
        Wan22Image2VideoCoreDenoiseStep,
        Wan22CoreDenoiseStep,
    ]
    block_names = ["image2video", "text2video"]
    block_trigger_inputs = ["first_frame_latents", None]

    def description(self) -> str:
        """Return a human-readable summary of the branches of this auto block."""
        # Fixed: the first fragments were joined without line breaks, so the
        # bullet list was rendered as one run-on sentence.
        return (
            "Denoise step that iteratively denoise the latents. "
            "This is a auto pipeline block that works for text2video and image2video tasks.\n"
            " - `Wan22Image2VideoCoreDenoiseStep` (image2video) for image2video tasks.\n"
            " - `Wan22CoreDenoiseStep` (text2video) for text2vid tasks.\n"
            " - if `first_frame_latents` is provided, `Wan22Image2VideoCoreDenoiseStep` will be used.\n"
            " - if `first_frame_latents` is not provided, `Wan22CoreDenoiseStep` will be used.\n"
        )
class Wan22AutoBlocks(SequentialPipelineBlocks):
    """End-to-end auto pipeline for Wan 2.2.

    Chains the text encoder, the auto VAE image encoder, the Wan 2.2 auto
    denoise step and the decoder.
    """

    # Sub-block classes of this sequential block.
    # NOTE(review): `WanAutoVaeImageEncoderStep` also triggers on `last_image`,
    # whereas `Wan22AutoDenoiseStep` only triggers on `first_frame_latents` —
    # confirm that FLF2V is intentionally unsupported here.
    block_classes = [
        WanTextEncoderStep,
        WanAutoVaeImageEncoderStep,
        Wan22AutoDenoiseStep,
        WanImageVaeDecoderStep,
    ]
    # One name per entry in `block_classes`, in the same order.
    block_names = [
        "text_encoder",
        "vae_encoder",
        "denoise",
        "decode",
    ]

    def description(self):
        """Return a human-readable summary of the supported workflows."""
        # Fixed: the previous text only mentioned text-to-video, although the
        # image2video branch is wired in via the VAE encoder and denoise blocks.
        return (
            "Auto Modular pipeline for text-to-video and image-to-video using Wan2.2.\n"
            "- for text-to-video generation, all you need to provide is `prompt`\n"
            "- for image-to-video generation, you need to provide `prompt` and `image`"
        )
| # presets for wan2.1 and wan2.2 | |
| # YiYi Notes: should we move these to doc? | |
| # wan2.1 | |
# Wan 2.1 text-to-video preset: ordered (block name, block class) pairs.
TEXT2VIDEO_BLOCKS = InsertableDict(
    [
        # conditioning
        ("text_encoder", WanTextEncoderStep),
        # denoise chain
        ("input", WanTextInputStep),
        ("set_timesteps", WanSetTimestepsStep),
        ("prepare_latents", WanPrepareLatentsStep),
        ("denoise", WanDenoiseStep),
        # latents -> output
        ("decode", WanImageVaeDecoderStep),
    ]
)
# Wan 2.1 image-to-video preset: ordered (block name, block class) pairs.
IMAGE2VIDEO_BLOCKS = InsertableDict(
    [
        # image conditioning
        ("image_resize", WanImageResizeStep),
        ("image_encoder", WanImage2VideoImageEncoderStep),
        ("vae_encoder", WanImage2VideoVaeImageEncoderStep),
        # denoise chain, conditioned on `first_frame_latents`
        ("input", WanTextInputStep),
        ("additional_inputs", WanAdditionalInputsStep(image_latent_inputs=["first_frame_latents"])),
        ("set_timesteps", WanSetTimestepsStep),
        ("prepare_latents", WanPrepareLatentsStep),
        ("prepare_first_frame_latents", WanPrepareFirstFrameLatentsStep),
        ("denoise", WanImage2VideoDenoiseStep),
        # latents -> output
        ("decode", WanImageVaeDecoderStep),
    ]
)
# Wan 2.1 first-last-frame-to-video preset: ordered (block name, block class) pairs.
FLF2V_BLOCKS = InsertableDict(
    [
        # image conditioning
        ("image_resize", WanImageResizeStep),
        ("last_image_resize", WanImageCropResizeStep),
        ("image_encoder", WanFLF2VImageEncoderStep),
        ("vae_encoder", WanFLF2VVaeImageEncoderStep),
        # denoise chain, conditioned on `first_last_frame_latents`
        ("input", WanTextInputStep),
        ("additional_inputs", WanAdditionalInputsStep(image_latent_inputs=["first_last_frame_latents"])),
        ("set_timesteps", WanSetTimestepsStep),
        ("prepare_latents", WanPrepareLatentsStep),
        ("prepare_first_last_frame_latents", WanPrepareFirstLastFrameLatentsStep),
        ("denoise", WanFLF2VDenoiseStep),
        # latents -> output
        ("decode", WanImageVaeDecoderStep),
    ]
)
# Wan 2.1 auto preset: same names and classes as `WanAutoBlocks`, as ordered pairs.
AUTO_BLOCKS = InsertableDict(
    [
        ("text_encoder", WanTextEncoderStep),
        ("image_encoder", WanAutoImageEncoderStep),
        ("vae_encoder", WanAutoVaeImageEncoderStep),
        ("denoise", WanAutoDenoiseStep),
        ("decode", WanImageVaeDecoderStep),
    ]
)
| # wan2.2 presets | |
# Wan 2.2 text-to-video preset: identical to the Wan 2.1 one except for the
# `denoise` entry, which uses `Wan22DenoiseStep`.
TEXT2VIDEO_BLOCKS_WAN22 = InsertableDict(
    [
        # conditioning
        ("text_encoder", WanTextEncoderStep),
        # denoise chain
        ("input", WanTextInputStep),
        ("set_timesteps", WanSetTimestepsStep),
        ("prepare_latents", WanPrepareLatentsStep),
        ("denoise", Wan22DenoiseStep),
        # latents -> output
        ("decode", WanImageVaeDecoderStep),
    ]
)
# Wan 2.2 image-to-video preset: ordered (block name, block class) pairs.
#
# Fixed: the denoise part of this preset used to be the text-to-video chain
# (`Wan22DenoiseStep`, with no `additional_inputs` and no
# `prepare_first_frame_latents`). It now mirrors the sub-blocks of
# `Wan22Image2VideoCoreDenoiseStep`, so the preset matches the image2video
# chain that the auto blocks select.
IMAGE2VIDEO_BLOCKS_WAN22 = InsertableDict(
    [
        # image conditioning
        ("image_resize", WanImageResizeStep),
        ("vae_encoder", WanImage2VideoVaeImageEncoderStep),
        # denoise chain, conditioned on `first_frame_latents`
        ("input", WanTextInputStep),
        ("additional_inputs", WanAdditionalInputsStep(image_latent_inputs=["first_frame_latents"])),
        ("set_timesteps", WanSetTimestepsStep),
        ("prepare_latents", WanPrepareLatentsStep),
        ("prepare_first_frame_latents", WanPrepareFirstFrameLatentsStep),
        ("denoise", Wan22Image2VideoDenoiseStep),
        # latents -> output
        ("decode", WanImageVaeDecoderStep),
    ]
)
# Wan 2.2 auto preset: same names and classes as `Wan22AutoBlocks`, as ordered pairs.
AUTO_BLOCKS_WAN22 = InsertableDict(
    [
        ("text_encoder", WanTextEncoderStep),
        ("vae_encoder", WanAutoVaeImageEncoderStep),
        ("denoise", Wan22AutoDenoiseStep),
        ("decode", WanImageVaeDecoderStep),
    ]
)
| # presets all blocks (wan and wan22) | |
# Registry of every preset, keyed first by model version and then by task name.
ALL_BLOCKS = {
    # Wan 2.1 presets
    "wan2.1": {
        "text2video": TEXT2VIDEO_BLOCKS,
        "image2video": IMAGE2VIDEO_BLOCKS,
        "flf2v": FLF2V_BLOCKS,
        "auto": AUTO_BLOCKS,
    },
    # Wan 2.2 presets (no "flf2v" entry is defined for this version)
    "wan2.2": {
        "text2video": TEXT2VIDEO_BLOCKS_WAN22,
        "image2video": IMAGE2VIDEO_BLOCKS_WAN22,
        "auto": AUTO_BLOCKS_WAN22,
    },
}