Spaces:

iimmortall
/

InstantRetouch

Running on Zero

App Files Files Community

InstantRetouch / vendor /diffusers /modular_pipelines /wan /modular_blocks.py

iimmortall

Deploy InstantRetouch BILA ZeroGPU Space

bc275c2 verified 2 days ago

raw

history blame contribute delete

16.7 kB

	# Copyright 2025 The HuggingFace Team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	from ...utils import logging
	from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
	from ..modular_pipeline_utils import InsertableDict
	from .before_denoise import (
	WanAdditionalInputsStep,
	WanPrepareFirstFrameLatentsStep,
	WanPrepareFirstLastFrameLatentsStep,
	WanPrepareLatentsStep,
	WanSetTimestepsStep,
	WanTextInputStep,
	)
	from .decoders import WanImageVaeDecoderStep
	from .denoise import (
	Wan22DenoiseStep,
	Wan22Image2VideoDenoiseStep,
	WanDenoiseStep,
	WanFLF2VDenoiseStep,
	WanImage2VideoDenoiseStep,
	)
	from .encoders import (
	WanFirstLastFrameImageEncoderStep,
	WanFirstLastFrameVaeImageEncoderStep,
	WanImageCropResizeStep,
	WanImageEncoderStep,
	WanImageResizeStep,
	WanTextEncoderStep,
	WanVaeImageEncoderStep,
	)


	# Module-level logger, named after this module and obtained through the package's `logging` utility.
	logger = logging.get_logger(__name__) # pylint: disable=invalid-name


	# wan2.1
	# wan2.1: text2vid
	class WanCoreDenoiseStep(SequentialPipelineBlocks):
	    """Wan2.1 text-to-video denoise stage: input, timesteps, latents, then the denoise loop."""

	    # Sub-block names, position-matched with `block_classes` below.
	    block_names = ["input", "set_timesteps", "prepare_latents", "denoise"]
	    block_classes = [
	        WanTextInputStep,
	        WanSetTimestepsStep,
	        WanPrepareLatentsStep,
	        WanDenoiseStep,
	    ]

	    @property
	    def description(self):
	        """Return a human-readable summary of this block and its sub-blocks."""
	        summary_lines = (
	            "denoise block that takes encoded conditions and runs the denoising process.\n",
	            "This is a sequential pipeline blocks:\n",
	            " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n",
	            " - `WanSetTimestepsStep` is used to set the timesteps\n",
	            " - `WanPrepareLatentsStep` is used to prepare the latents\n",
	            " - `WanDenoiseStep` is used to denoise the latents\n",
	        )
	        return "".join(summary_lines)


	# wan2.1: image2video
	## image encoder
	class WanImage2VideoImageEncoderStep(SequentialPipelineBlocks):
	    """Image-to-video image encoder: resize the image, then encode it."""

	    model_name = "wan"
	    # Sub-block names, position-matched with `block_classes` below.
	    block_names = ["image_resize", "image_encoder"]
	    block_classes = [WanImageResizeStep, WanImageEncoderStep]

	    @property
	    def description(self):
	        """Return a one-line summary of this block."""
	        summary = (
	            "Image2Video Image Encoder step that resize the image "
	            "and encode the image to generate the image embeddings"
	        )
	        return summary


	## vae encoder
	class WanImage2VideoVaeImageEncoderStep(SequentialPipelineBlocks):
	    """Image-to-video VAE encoder: resize the image, then run the VAE image encoder step."""

	    model_name = "wan"
	    # Sub-block names, position-matched with `block_classes` below.
	    block_names = ["image_resize", "vae_encoder"]
	    block_classes = [WanImageResizeStep, WanVaeImageEncoderStep]

	    @property
	    def description(self):
	        """Return a one-line summary of this block."""
	        summary = (
	            "Image2Video Vae Image Encoder step that resize the image "
	            "and encode the first frame image to its latent representation"
	        )
	        return summary


	## denoise
	class WanImage2VideoCoreDenoiseStep(SequentialPipelineBlocks):
	    """Wan2.1 image-to-video denoise stage, conditioned on `first_frame_latents`."""

	    # Sub-block names, position-matched with `block_classes` below.
	    block_names = [
	        "input",
	        "additional_inputs",
	        "set_timesteps",
	        "prepare_latents",
	        "prepare_first_frame_latents",
	        "denoise",
	    ]
	    block_classes = [
	        WanTextInputStep,
	        # Configured instance (not a class): handles the `first_frame_latents` input.
	        WanAdditionalInputsStep(image_latent_inputs=["first_frame_latents"]),
	        WanSetTimestepsStep,
	        WanPrepareLatentsStep,
	        WanPrepareFirstFrameLatentsStep,
	        WanImage2VideoDenoiseStep,
	    ]

	    @property
	    def description(self):
	        """Return a human-readable summary of this block and its sub-blocks."""
	        summary_lines = (
	            "denoise block that takes encoded text and image latent conditions and runs the denoising process.\n",
	            "This is a sequential pipeline blocks:\n",
	            " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n",
	            " - `WanAdditionalInputsStep` is used to adjust the batch size of the latent conditions\n",
	            " - `WanSetTimestepsStep` is used to set the timesteps\n",
	            " - `WanPrepareLatentsStep` is used to prepare the latents\n",
	            " - `WanPrepareFirstFrameLatentsStep` is used to prepare the first frame latent conditions\n",
	            " - `WanImage2VideoDenoiseStep` is used to denoise the latents\n",
	        )
	        return "".join(summary_lines)


	# wan2.1: FLF2v


	## image encoder
	class WanFLF2VImageEncoderStep(SequentialPipelineBlocks):
	    """First-last-frame-to-video image encoder: two resize steps followed by the first/last frame encoder."""

	    model_name = "wan"
	    block_classes = [WanImageResizeStep, WanImageCropResizeStep, WanFirstLastFrameImageEncoderStep]
	    # Sub-block names, position-matched with `block_classes` above.
	    block_names = ["image_resize", "last_image_resize", "image_encoder"]

	    @property
	    def description(self) -> str:
	        """Return a one-line summary of this block."""
	        # Fixed: the original text contained the duplicated words "and encode and encode".
	        return (
	            "FLF2V Image Encoder step that resize and encode the first and last frame images "
	            "to generate the image embeddings"
	        )


	## vae encoder
	class WanFLF2VVaeImageEncoderStep(SequentialPipelineBlocks):
	    """First-last-frame-to-video VAE encoder: two resize steps followed by the first/last frame VAE encoder."""

	    model_name = "wan"
	    block_classes = [WanImageResizeStep, WanImageCropResizeStep, WanFirstLastFrameVaeImageEncoderStep]
	    # Sub-block names, position-matched with `block_classes` above.
	    block_names = ["image_resize", "last_image_resize", "vae_encoder"]

	    @property
	    def description(self) -> str:
	        """Return a one-line summary of this block."""
	        # Fixed: the original text contained the duplicated words "and encode and encode".
	        return (
	            "FLF2V Vae Image Encoder step that resize and encode the first and last frame images "
	            "to generate the latent conditions"
	        )


	## denoise
	class WanFLF2VCoreDenoiseStep(SequentialPipelineBlocks):
	    """Wan2.1 first-last-frame-to-video denoise stage, conditioned on `first_last_frame_latents`."""

	    block_classes = [
	        WanTextInputStep,
	        # Configured instance (not a class): handles the `first_last_frame_latents` input.
	        WanAdditionalInputsStep(image_latent_inputs=["first_last_frame_latents"]),
	        WanSetTimestepsStep,
	        WanPrepareLatentsStep,
	        WanPrepareFirstLastFrameLatentsStep,
	        WanFLF2VDenoiseStep,
	    ]
	    # Sub-block names, position-matched with `block_classes` above.
	    block_names = [
	        "input",
	        "additional_inputs",
	        "set_timesteps",
	        "prepare_latents",
	        "prepare_first_last_frame_latents",
	        "denoise",
	    ]

	    @property
	    def description(self) -> str:
	        """Return a human-readable summary of this block and its sub-blocks."""
	        # Fixed: the last bullet used to name `WanImage2VideoDenoiseStep`, which is not
	        # part of this block; the denoiser listed in `block_classes` is `WanFLF2VDenoiseStep`.
	        return (
	            "denoise block that takes encoded text and image latent conditions and runs the denoising process.\n"
	            + "This is a sequential pipeline blocks:\n"
	            + " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n"
	            + " - `WanAdditionalInputsStep` is used to adjust the batch size of the latent conditions\n"
	            + " - `WanSetTimestepsStep` is used to set the timesteps\n"
	            + " - `WanPrepareLatentsStep` is used to prepare the latents\n"
	            + " - `WanPrepareFirstLastFrameLatentsStep` is used to prepare the latent conditions\n"
	            + " - `WanFLF2VDenoiseStep` is used to denoise the latents\n"
	        )


	# wan2.1: auto blocks
	## image encoder
	class WanAutoImageEncoderStep(AutoPipelineBlocks):
	    """Auto block that picks the FLF2V or image-to-video image encoder from the inputs provided."""

	    block_classes = [WanFLF2VImageEncoderStep, WanImage2VideoImageEncoderStep]
	    block_names = ["flf2v_image_encoder", "image2video_image_encoder"]
	    # Position-matched with `block_classes`: the input whose presence selects each sub-block.
	    block_trigger_inputs = ["last_image", "image"]

	    @property
	    def description(self) -> str:
	        """Return a human-readable summary of this block and its selection rules."""
	        # Fixed: the fragments were concatenated without separators, so sentences and
	        # bullets ran together ("...embeddingsThis is..."). Each now ends with a newline.
	        return (
	            "Image Encoder step that encode the image to generate the image embeddings.\n"
	            + "This is an auto pipeline block that works for image2video and flf2v tasks.\n"
	            + " - `WanFLF2VImageEncoderStep` (flf2v) is used when `last_image` is provided.\n"
	            + " - `WanImage2VideoImageEncoderStep` (image2video) is used when `image` is provided.\n"
	            + " - if neither `last_image` nor `image` is provided, step will be skipped."
	        )


	## vae encoder
	class WanAutoVaeImageEncoderStep(AutoPipelineBlocks):
	    """Auto block that picks the FLF2V or image-to-video VAE image encoder from the inputs provided."""

	    block_classes = [WanFLF2VVaeImageEncoderStep, WanImage2VideoVaeImageEncoderStep]
	    block_names = ["flf2v_vae_encoder", "image2video_vae_encoder"]
	    # Position-matched with `block_classes`: the input whose presence selects each sub-block.
	    block_trigger_inputs = ["last_image", "image"]

	    @property
	    def description(self) -> str:
	        """Return a human-readable summary of this block and its selection rules."""
	        # Fixed: the fragments were concatenated without separators, so sentences and
	        # bullets ran together ("...latentsThis is..."). Each now ends with a newline.
	        return (
	            "Vae Image Encoder step that encode the image to generate the image latents.\n"
	            + "This is an auto pipeline block that works for image2video and flf2v tasks.\n"
	            + " - `WanFLF2VVaeImageEncoderStep` (flf2v) is used when `last_image` is provided.\n"
	            + " - `WanImage2VideoVaeImageEncoderStep` (image2video) is used when `image` is provided.\n"
	            + " - if neither `last_image` nor `image` is provided, step will be skipped."
	        )


	## denoise
	class WanAutoDenoiseStep(AutoPipelineBlocks):
	    """Auto block that picks the Wan2.1 flf2v, image2video or text2video denoise stage."""

	    block_classes = [
	        WanFLF2VCoreDenoiseStep,
	        WanImage2VideoCoreDenoiseStep,
	        WanCoreDenoiseStep,
	    ]
	    block_names = ["flf2v", "image2video", "text2video"]
	    # Position-matched with `block_classes`; the trailing `None` marks the text2video
	    # entry as the one used when no image latents are provided.
	    block_trigger_inputs = ["first_last_frame_latents", "first_frame_latents", None]

	    @property
	    def description(self) -> str:
	        """Return a human-readable summary of this block and its selection rules."""
	        # Fixed: the original text named classes that do not exist in this module
	        # (`WanCoreImage2VideoCoreDenoiseStep`, `WanCoreImage2VideoDenoiseStep`), left out
	        # the flf2v branch, and ran several bullets together without newlines.
	        return (
	            "Denoise step that iteratively denoise the latents.\n"
	            + "This is a auto pipeline block that works for text2video, image2video and flf2v tasks.\n"
	            + " - `WanFLF2VCoreDenoiseStep` (flf2v) is used when `first_last_frame_latents` is provided.\n"
	            + " - `WanImage2VideoCoreDenoiseStep` (image2video) is used when `first_frame_latents` is provided.\n"
	            + " - `WanCoreDenoiseStep` (text2video) is used when neither `first_last_frame_latents` nor"
	            + " `first_frame_latents` is provided.\n"
	        )


	# auto pipeline blocks
	class WanAutoBlocks(SequentialPipelineBlocks):
	    """End-to-end Wan2.1 auto pipeline: text encoder, auto image/VAE encoders, auto denoise, decode."""

	    block_classes = [
	        WanTextEncoderStep,
	        WanAutoImageEncoderStep,
	        WanAutoVaeImageEncoderStep,
	        WanAutoDenoiseStep,
	        WanImageVaeDecoderStep,
	    ]
	    # Sub-block names, position-matched with `block_classes` above.
	    block_names = [
	        "text_encoder",
	        "image_encoder",
	        "vae_encoder",
	        "denoise",
	        "decode",
	    ]

	    @property
	    def description(self) -> str:
	        """Return a human-readable summary of the tasks this pipeline supports."""
	        # Fixed: the original text only mentioned text-to-video, although the auto
	        # encoder sub-blocks are triggered by `image` and `last_image` as well.
	        return (
	            "Auto Modular pipeline for text-to-video, image-to-video and first-last-frame-to-video using Wan.\n"
	            + "- for text-to-video generation, all you need to provide is `prompt`\n"
	            + "- for image-to-video generation, you need to provide `prompt` and `image`\n"
	            + "- for first-last-frame-to-video generation, you need to provide `prompt`, `image` and `last_image`"
	        )


	# wan22
	# wan2.2: text2vid


	## denoise
	class Wan22CoreDenoiseStep(SequentialPipelineBlocks):
	    """Wan2.2 text-to-video denoise stage: input, timesteps, latents, then the Wan2.2 denoise loop."""

	    # Sub-block names, position-matched with `block_classes` below.
	    block_names = ["input", "set_timesteps", "prepare_latents", "denoise"]
	    block_classes = [
	        WanTextInputStep,
	        WanSetTimestepsStep,
	        WanPrepareLatentsStep,
	        Wan22DenoiseStep,
	    ]

	    @property
	    def description(self):
	        """Return a human-readable summary of this block and its sub-blocks."""
	        summary_lines = (
	            "denoise block that takes encoded conditions and runs the denoising process.\n",
	            "This is a sequential pipeline blocks:\n",
	            " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n",
	            " - `WanSetTimestepsStep` is used to set the timesteps\n",
	            " - `WanPrepareLatentsStep` is used to prepare the latents\n",
	            " - `Wan22DenoiseStep` is used to denoise the latents in wan2.2\n",
	        )
	        return "".join(summary_lines)


	# wan2.2: image2video
	## denoise
	class Wan22Image2VideoCoreDenoiseStep(SequentialPipelineBlocks):
	    """Wan2.2 image-to-video denoise stage, conditioned on `first_frame_latents`."""

	    # Sub-block names, position-matched with `block_classes` below.
	    block_names = [
	        "input",
	        "additional_inputs",
	        "set_timesteps",
	        "prepare_latents",
	        "prepare_first_frame_latents",
	        "denoise",
	    ]
	    block_classes = [
	        WanTextInputStep,
	        # Configured instance (not a class): handles the `first_frame_latents` input.
	        WanAdditionalInputsStep(image_latent_inputs=["first_frame_latents"]),
	        WanSetTimestepsStep,
	        WanPrepareLatentsStep,
	        WanPrepareFirstFrameLatentsStep,
	        Wan22Image2VideoDenoiseStep,
	    ]

	    @property
	    def description(self):
	        """Return a human-readable summary of this block and its sub-blocks."""
	        summary_lines = (
	            "denoise block that takes encoded text and image latent conditions and runs the denoising process.\n",
	            "This is a sequential pipeline blocks:\n",
	            " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n",
	            " - `WanAdditionalInputsStep` is used to adjust the batch size of the latent conditions\n",
	            " - `WanSetTimestepsStep` is used to set the timesteps\n",
	            " - `WanPrepareLatentsStep` is used to prepare the latents\n",
	            " - `WanPrepareFirstFrameLatentsStep` is used to prepare the first frame latent conditions\n",
	            " - `Wan22Image2VideoDenoiseStep` is used to denoise the latents in wan2.2\n",
	        )
	        return "".join(summary_lines)


	class Wan22AutoDenoiseStep(AutoPipelineBlocks):
	    """Auto block that picks the Wan2.2 image2video or text2video denoise stage."""

	    block_classes = [
	        Wan22Image2VideoCoreDenoiseStep,
	        Wan22CoreDenoiseStep,
	    ]
	    block_names = ["image2video", "text2video"]
	    # Position-matched with `block_classes`; the trailing `None` marks the text2video
	    # entry as the one used when `first_frame_latents` is not provided.
	    block_trigger_inputs = ["first_frame_latents", None]

	    @property
	    def description(self) -> str:
	        """Return a human-readable summary of this block and its selection rules."""
	        # Fixed: the first four fragments were joined without newlines, so the sentences
	        # and bullets ran together; the redundant bullets are merged into one per branch.
	        return (
	            "Denoise step that iteratively denoise the latents.\n"
	            + "This is a auto pipeline block that works for text2video and image2video tasks.\n"
	            + " - `Wan22Image2VideoCoreDenoiseStep` (image2video) is used when `first_frame_latents` is provided.\n"
	            + " - `Wan22CoreDenoiseStep` (text2video) is used when `first_frame_latents` is not provided.\n"
	        )


	class Wan22AutoBlocks(SequentialPipelineBlocks):
	    """End-to-end Wan2.2 auto pipeline: text encoder, auto VAE encoder, auto denoise, decode."""

	    block_classes = [
	        WanTextEncoderStep,
	        WanAutoVaeImageEncoderStep,
	        Wan22AutoDenoiseStep,
	        WanImageVaeDecoderStep,
	    ]
	    # Sub-block names, position-matched with `block_classes` above.
	    block_names = [
	        "text_encoder",
	        "vae_encoder",
	        "denoise",
	        "decode",
	    ]

	    # NOTE(review): `WanAutoVaeImageEncoderStep` also triggers on `last_image`, but
	    # `Wan22AutoDenoiseStep` only triggers on `first_frame_latents` — presumably flf2v is
	    # not supported for Wan2.2; confirm what happens when `last_image` is passed.

	    @property
	    def description(self) -> str:
	        """Return a human-readable summary of the tasks this pipeline supports."""
	        # Fixed: the original text only mentioned text-to-video, although this pipeline
	        # contains an `image`-triggered VAE encoder and an image2video denoise branch.
	        return (
	            "Auto Modular pipeline for text-to-video and image-to-video using Wan2.2.\n"
	            + "- for text-to-video generation, all you need to provide is `prompt`\n"
	            + "- for image-to-video generation, you need to provide `prompt` and `image`"
	        )


	# presets for wan2.1 and wan2.2
	# YiYi Notes: should we move these to doc?
	# wan2.1
	# Wan2.1 text-to-video preset: ordered (block name, block class) pairs.
	# The "input" .. "denoise" entries are the same blocks, in the same order, as
	# `WanCoreDenoiseStep.block_classes`, with text encoding before and decoding after.
	TEXT2VIDEO_BLOCKS = InsertableDict(
	[
	("text_encoder", WanTextEncoderStep),
	("input", WanTextInputStep),
	("set_timesteps", WanSetTimestepsStep),
	("prepare_latents", WanPrepareLatentsStep),
	("denoise", WanDenoiseStep),
	("decode", WanImageVaeDecoderStep),
	]
	)

	# Wan2.1 image-to-video preset: ordered (block name, block) pairs.
	# The "input" .. "denoise" entries are the same blocks, in the same order, as
	# `WanImage2VideoCoreDenoiseStep.block_classes`.
	# NOTE(review): the "image_encoder" and "vae_encoder" entries are sequential blocks whose
	# own `block_classes` already start with `WanImageResizeStep`, so the standalone
	# "image_resize" entry looks redundant — confirm.
	# NOTE(review): unlike TEXT2VIDEO_BLOCKS there is no "text_encoder" entry here — confirm
	# whether that is intentional.
	IMAGE2VIDEO_BLOCKS = InsertableDict(
	[
	("image_resize", WanImageResizeStep),
	("image_encoder", WanImage2VideoImageEncoderStep),
	("vae_encoder", WanImage2VideoVaeImageEncoderStep),
	("input", WanTextInputStep),
	("additional_inputs", WanAdditionalInputsStep(image_latent_inputs=["first_frame_latents"])),
	("set_timesteps", WanSetTimestepsStep),
	("prepare_latents", WanPrepareLatentsStep),
	("prepare_first_frame_latents", WanPrepareFirstFrameLatentsStep),
	("denoise", WanImage2VideoDenoiseStep),
	("decode", WanImageVaeDecoderStep),
	]
	)


	# Wan2.1 first-last-frame-to-video preset: ordered (block name, block) pairs.
	# The "input" .. "denoise" entries are the same blocks, in the same order, as
	# `WanFLF2VCoreDenoiseStep.block_classes`.
	# NOTE(review): the "image_encoder" and "vae_encoder" entries are sequential blocks whose
	# own `block_classes` already start with `WanImageResizeStep` and `WanImageCropResizeStep`,
	# so the standalone "image_resize" / "last_image_resize" entries look redundant — confirm.
	# NOTE(review): unlike TEXT2VIDEO_BLOCKS there is no "text_encoder" entry here — confirm
	# whether that is intentional.
	FLF2V_BLOCKS = InsertableDict(
	[
	("image_resize", WanImageResizeStep),
	("last_image_resize", WanImageCropResizeStep),
	("image_encoder", WanFLF2VImageEncoderStep),
	("vae_encoder", WanFLF2VVaeImageEncoderStep),
	("input", WanTextInputStep),
	("additional_inputs", WanAdditionalInputsStep(image_latent_inputs=["first_last_frame_latents"])),
	("set_timesteps", WanSetTimestepsStep),
	("prepare_latents", WanPrepareLatentsStep),
	("prepare_first_last_frame_latents", WanPrepareFirstLastFrameLatentsStep),
	("denoise", WanFLF2VDenoiseStep),
	("decode", WanImageVaeDecoderStep),
	]
	)

	# Wan2.1 auto preset: the same names and blocks, in the same order, as
	# `WanAutoBlocks.block_names` / `WanAutoBlocks.block_classes`.
	AUTO_BLOCKS = InsertableDict(
	[
	("text_encoder", WanTextEncoderStep),
	("image_encoder", WanAutoImageEncoderStep),
	("vae_encoder", WanAutoVaeImageEncoderStep),
	("denoise", WanAutoDenoiseStep),
	("decode", WanImageVaeDecoderStep),
	]
	)

	# wan2.2 presets

	# Wan2.2 text-to-video preset: ordered (block name, block class) pairs.
	# The "input" .. "denoise" entries are the same blocks, in the same order, as
	# `Wan22CoreDenoiseStep.block_classes`, with text encoding before and decoding after.
	TEXT2VIDEO_BLOCKS_WAN22 = InsertableDict(
	[
	("text_encoder", WanTextEncoderStep),
	("input", WanTextInputStep),
	("set_timesteps", WanSetTimestepsStep),
	("prepare_latents", WanPrepareLatentsStep),
	("denoise", Wan22DenoiseStep),
	("decode", WanImageVaeDecoderStep),
	]
	)

	# Wan2.2 image-to-video preset: ordered (block name, block) pairs.
	# Fixed: the preset previously ended in the text2video `Wan22DenoiseStep` and had no
	# "additional_inputs" / "prepare_first_frame_latents" entries, so it disagreed with
	# `Wan22Image2VideoCoreDenoiseStep`. The "input" .. "denoise" entries now mirror that
	# block's `block_classes` and `block_names` one-for-one.
	# NOTE(review): as in the Wan2.1 IMAGE2VIDEO_BLOCKS preset, there is no "text_encoder"
	# entry here — confirm whether that is intentional.
	IMAGE2VIDEO_BLOCKS_WAN22 = InsertableDict(
	    [
	        ("image_resize", WanImageResizeStep),
	        ("vae_encoder", WanImage2VideoVaeImageEncoderStep),
	        ("input", WanTextInputStep),
	        ("additional_inputs", WanAdditionalInputsStep(image_latent_inputs=["first_frame_latents"])),
	        ("set_timesteps", WanSetTimestepsStep),
	        ("prepare_latents", WanPrepareLatentsStep),
	        ("prepare_first_frame_latents", WanPrepareFirstFrameLatentsStep),
	        ("denoise", Wan22Image2VideoDenoiseStep),
	        ("decode", WanImageVaeDecoderStep),
	    ]
	)

	# Wan2.2 auto preset: the same names and blocks, in the same order, as
	# `Wan22AutoBlocks.block_names` / `Wan22AutoBlocks.block_classes`.
	AUTO_BLOCKS_WAN22 = InsertableDict(
	[
	("text_encoder", WanTextEncoderStep),
	("vae_encoder", WanAutoVaeImageEncoderStep),
	("denoise", Wan22AutoDenoiseStep),
	("decode", WanImageVaeDecoderStep),
	]
	)

	# all preset blocks, grouped by model version (wan2.1 and wan2.2)


	# Registry of every preset defined above: model version -> task name -> preset.
	# Note that "wan2.2" has no "flf2v" entry; only "wan2.1" does.
	ALL_BLOCKS = {
	"wan2.1": {
	"text2video": TEXT2VIDEO_BLOCKS,
	"image2video": IMAGE2VIDEO_BLOCKS,
	"flf2v": FLF2V_BLOCKS,
	"auto": AUTO_BLOCKS,
	},
	"wan2.2": {
	"text2video": TEXT2VIDEO_BLOCKS_WAN22,
	"image2video": IMAGE2VIDEO_BLOCKS_WAN22,
	"auto": AUTO_BLOCKS_WAN22,
	},
	}