build-tools / diffusers /modular_pipelines /z_image /modular_blocks_z_image.py
salmankhanpm's picture
Add files using upload-large-folder tool
4f4376a verified
# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import OutputParam
from .before_denoise import (
ZImageAdditionalInputsStep,
ZImagePrepareLatentsStep,
ZImagePrepareLatentswithImageStep,
ZImageSetTimestepsStep,
ZImageSetTimestepsWithStrengthStep,
ZImageTextInputStep,
)
from .decoders import ZImageVaeDecoderStep
from .denoise import (
ZImageDenoiseStep,
)
from .encoders import (
ZImageTextEncoderStep,
ZImageVaeImageEncoderStep,
)
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
# ====================
# 1. DENOISE
# ====================
# text2image: inputs(text) -> prepare_latents -> set_timesteps -> denoise
# auto_docstring
class ZImageCoreDenoiseStep(SequentialPipelineBlocks):
    """
    Core text2image denoise block: takes encoded conditions and runs the denoising process.

    The sub-blocks are declared in this order: `input` -> `prepare_latents` -> `set_timesteps` -> `denoise`.

    Descriptions tagged "(unverified)" are inferred from the parameter name only; the sub-blocks that consume
    those inputs are defined in other modules and should be checked before relying on the wording.

    Components:
        transformer (`ZImageTransformer2DModel`)
        scheduler (`FlowMatchEulerDiscreteScheduler`)
        guider (`ClassifierFreeGuidance`)

    Inputs:
        num_images_per_prompt (`None`, *optional*, defaults to 1):
            (unverified) Number of images to generate for each prompt.
        prompt_embeds (`list`):
            Pre-generated text embeddings. Can be generated from text_encoder step.
        negative_prompt_embeds (`list`, *optional*):
            Pre-generated negative text embeddings. Can be generated from text_encoder step.
        height (`int`, *optional*):
            (unverified) Height of the generated image.
        width (`int`, *optional*):
            (unverified) Width of the generated image.
        latents (`Tensor | NoneType`, *optional*):
            (unverified) Pre-generated initial latents to start denoising from.
        generator (`None`, *optional*):
            (unverified) Random generator used to make sampling reproducible.
        num_inference_steps (`None`, *optional*, defaults to 9):
            (unverified) Number of denoising steps.
        sigmas (`None`, *optional*):
            (unverified) Custom sigma schedule handed to the scheduler.
        **denoiser_input_fields (`None`, *optional*):
            The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.

    Outputs:
        latents (`Tensor`):
            Denoised latents.
    """

    # `block_names` and `block_classes` are parallel lists: entry i of one labels entry i of the other.
    block_names = ["input", "prepare_latents", "set_timesteps", "denoise"]
    block_classes = [
        ZImageTextInputStep,
        ZImagePrepareLatentsStep,
        ZImageSetTimestepsStep,
        ZImageDenoiseStep,
    ]

    @property
    def outputs(self):
        """Return the single output of this block: the `latents` template parameter."""
        latents_param = OutputParam.template("latents")
        return [latents_param]

    @property
    def description(self):
        """Return the one-line human-readable summary of this block."""
        summary = "denoise block that takes encoded conditions and runs the denoising process."
        return summary
# image2image: inputs(text + image_latents) -> prepare_latents -> set_timesteps -> set_timesteps_with_strength -> prepare_latents_with_image -> denoise
# auto_docstring
class ZImageImage2ImageCoreDenoiseStep(SequentialPipelineBlocks):
    """
    Core image2image denoise block: takes encoded text and image latent conditions and runs the denoising process.

    The sub-blocks are declared in this order: `input` -> `additional_inputs` -> `prepare_latents` ->
    `set_timesteps` -> `set_timesteps_with_strength` -> `prepare_latents_with_image` -> `denoise`.

    Descriptions tagged "(unverified)" are inferred from the parameter name only; the sub-blocks that consume
    those inputs are defined in other modules and should be checked before relying on the wording.

    Components:
        transformer (`ZImageTransformer2DModel`)
        scheduler (`FlowMatchEulerDiscreteScheduler`)
        guider (`ClassifierFreeGuidance`)

    Inputs:
        num_images_per_prompt (`None`, *optional*, defaults to 1):
            (unverified) Number of images to generate for each prompt.
        prompt_embeds (`list`):
            Pre-generated text embeddings. Can be generated from text_encoder step.
        negative_prompt_embeds (`list`, *optional*):
            Pre-generated negative text embeddings. Can be generated from text_encoder step.
        height (`None`, *optional*):
            (unverified) Height of the generated image.
        width (`None`, *optional*):
            (unverified) Width of the generated image.
        image_latents (`None`, *optional*):
            Image latents; registered with the `additional_inputs` sub-block through `image_latent_inputs`.
        latents (`Tensor | NoneType`, *optional*):
            (unverified) Pre-generated initial latents to start denoising from.
        generator (`None`, *optional*):
            (unverified) Random generator used to make sampling reproducible.
        num_inference_steps (`None`, *optional*, defaults to 9):
            (unverified) Number of denoising steps.
        sigmas (`None`, *optional*):
            (unverified) Custom sigma schedule handed to the scheduler.
        strength (`None`, *optional*, defaults to 0.6):
            (unverified) How strongly the input image is altered by the denoising process.
        **denoiser_input_fields (`None`, *optional*):
            The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.

    Outputs:
        latents (`Tensor`):
            Denoised latents.
    """

    # `block_names` and `block_classes` are parallel lists: entry i of one labels entry i of the other.
    block_names = [
        "input",
        "additional_inputs",
        "prepare_latents",
        "set_timesteps",
        "set_timesteps_with_strength",
        "prepare_latents_with_image",
        "denoise",
    ]
    block_classes = [
        ZImageTextInputStep,
        # The only entry given as a configured instance rather than a class: it is told which
        # input names carry image latents.
        ZImageAdditionalInputsStep(image_latent_inputs=["image_latents"]),
        ZImagePrepareLatentsStep,
        ZImageSetTimestepsStep,
        ZImageSetTimestepsWithStrengthStep,
        ZImagePrepareLatentswithImageStep,
        ZImageDenoiseStep,
    ]

    @property
    def outputs(self):
        """Return the single output of this block: the `latents` template parameter."""
        latents_param = OutputParam.template("latents")
        return [latents_param]

    @property
    def description(self):
        """Return the one-line human-readable summary of this block."""
        summary = "denoise block that takes encoded text and image latent conditions and runs the denoising process."
        return summary
# auto_docstring
class ZImageAutoDenoiseStep(AutoPipelineBlocks):
    """
    Auto denoise step that iteratively denoises the latents for both text2image and image2image tasks.

    Branch selection, as stated by `description` and `block_trigger_inputs`:
        - if `image_latents` is provided, `ZImageImage2ImageCoreDenoiseStep` (image2image) is used.
        - if `image_latents` is not provided, `ZImageCoreDenoiseStep` (text2image) is used.

    Descriptions tagged "(unverified)" are inferred from the parameter name only; the sub-blocks that consume
    those inputs are defined in other modules and should be checked before relying on the wording.

    Components:
        transformer (`ZImageTransformer2DModel`)
        scheduler (`FlowMatchEulerDiscreteScheduler`)
        guider (`ClassifierFreeGuidance`)

    Inputs:
        num_images_per_prompt (`None`, *optional*, defaults to 1):
            (unverified) Number of images to generate for each prompt.
        prompt_embeds (`list`):
            Pre-generated text embeddings. Can be generated from text_encoder step.
        negative_prompt_embeds (`list`, *optional*):
            Pre-generated negative text embeddings. Can be generated from text_encoder step.
        height (`None`, *optional*):
            (unverified) Height of the generated image.
        width (`None`, *optional*):
            (unverified) Width of the generated image.
        image_latents (`None`, *optional*):
            Trigger input: when provided, the image2image branch is selected.
        latents (`Tensor | NoneType`):
            (unverified) Pre-generated initial latents to start denoising from.
        generator (`None`, *optional*):
            (unverified) Random generator used to make sampling reproducible.
        num_inference_steps (`None`):
            (unverified) Number of denoising steps.
        sigmas (`None`, *optional*):
            (unverified) Custom sigma schedule handed to the scheduler.
        strength (`None`, *optional*, defaults to 0.6):
            (unverified) How strongly the input image is altered by the denoising process.
        **denoiser_input_fields (`None`, *optional*):
            The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.

    Outputs:
        latents (`Tensor`):
            Denoised latents.
    """

    # The three lists are parallel: the i-th trigger input selects the i-th block.
    # The `None` trigger marks the text2image entry, used when `image_latents` is absent.
    block_trigger_inputs = ["image_latents", None]
    block_names = ["image2image", "text2image"]
    block_classes = [
        ZImageImage2ImageCoreDenoiseStep,
        ZImageCoreDenoiseStep,
    ]

    @property
    def description(self) -> str:
        """Return the human-readable summary, including the branch-selection rules."""
        fragments = (
            "Denoise step that iteratively denoise the latents. ",
            "This is a auto pipeline block that works for text2image and image2image tasks.",
            " - `ZImageCoreDenoiseStep` (text2image) for text2image tasks.",
            " - `ZImageImage2ImageCoreDenoiseStep` (image2image) for image2image tasks.",
            " - if `image_latents` is provided, `ZImageImage2ImageCoreDenoiseStep` will be used.\n",
            " - if `image_latents` is not provided, `ZImageCoreDenoiseStep` will be used.\n",
        )
        return "".join(fragments)
# auto_docstring
class ZImageAutoVaeImageEncoderStep(AutoPipelineBlocks):
    """
    Vae Image Encoder step that encode the image to generate the image latents. This is an auto pipeline block
    that works for image2image tasks.
        - `ZImageVaeImageEncoderStep` is used when `image` is provided.
        - if `image` is not provided, step will be skipped.

    Descriptions tagged "(unverified)" are inferred from the parameter name only; the encoder sub-block that
    consumes those inputs is defined in another module and should be checked before relying on the wording.

    Components:
        vae (`AutoencoderKL`)
        image_processor (`VaeImageProcessor`)

    Inputs:
        image (`Image`, *optional*):
            Trigger input: the encoder sub-block is selected only when this is provided.
        height (`None`, *optional*):
            (unverified) Target height used when processing the image.
        width (`None`, *optional*):
            (unverified) Target width used when processing the image.
        generator (`None`, *optional*):
            (unverified) Random generator used to make latent sampling reproducible.

    Outputs:
        image_latents (`Tensor`):
            Latents encoded from `image`. NOTE(review): the previous generated text here read "video latent
            representation with the first frame image condition", which looks copied from a video pipeline;
            confirm and fix the wording on the encoder's output parameter.
    """

    # Single-branch auto block: the three lists are parallel, and `image` is the only trigger.
    block_classes = [ZImageVaeImageEncoderStep]
    block_names = ["vae_encoder"]
    block_trigger_inputs = ["image"]

    @property
    def description(self) -> str:
        """Return the full human-readable summary of this block.

        The adjacent literals are wrapped in parentheses so they form ONE returned string. Previously the
        `return` ended after the first literal and the following `+"..."` lines were unreachable statements,
        so everything after the first sentence was silently dropped.
        """
        return (
            "Vae Image Encoder step that encode the image to generate the image latents. "
            "This is an auto pipeline block that works for image2image tasks."
            " - `ZImageVaeImageEncoderStep` is used when `image` is provided."
            " - if `image` is not provided, step will be skipped."
        )
# auto_docstring
class ZImageAutoBlocks(SequentialPipelineBlocks):
    """
    Auto Modular pipeline for text-to-image and image-to-image using ZImage.

    The sub-blocks are declared in this order: `text_encoder` -> `vae_encoder` -> `denoise` -> `decode`.

    Supported workflows (see `_workflow_map`):
        - `text2image`: requires `prompt`
        - `image2image`: requires `image`, `prompt`

    Descriptions tagged "(unverified)" are inferred from the parameter name only; the sub-blocks that consume
    those inputs are defined in other modules and should be checked before relying on the wording.

    Components:
        text_encoder (`Qwen3Model`)
        tokenizer (`Qwen2Tokenizer`)
        guider (`ClassifierFreeGuidance`)
        vae (`AutoencoderKL`)
        image_processor (`VaeImageProcessor`)
        transformer (`ZImageTransformer2DModel`)
        scheduler (`FlowMatchEulerDiscreteScheduler`)

    Inputs:
        prompt (`None`, *optional*):
            (unverified) Text prompt guiding generation; listed as required by both workflows.
        negative_prompt (`None`, *optional*):
            (unverified) Text prompt describing what to steer away from.
        max_sequence_length (`None`, *optional*, defaults to 512):
            (unverified) Maximum token length used when encoding the prompt.
        image (`Image`, *optional*):
            Input image; listed as required by the `image2image` workflow.
        height (`None`, *optional*):
            (unverified) Height of the generated image.
        width (`None`, *optional*):
            (unverified) Width of the generated image.
        generator (`None`, *optional*):
            (unverified) Random generator used to make sampling reproducible.
        num_images_per_prompt (`None`, *optional*, defaults to 1):
            (unverified) Number of images to generate for each prompt.
        image_latents (`None`, *optional*):
            (unverified) Pre-computed image latents for the image-to-image path.
        latents (`Tensor | NoneType`):
            (unverified) Pre-generated initial latents to start denoising from.
        num_inference_steps (`None`):
            (unverified) Number of denoising steps.
        sigmas (`None`, *optional*):
            (unverified) Custom sigma schedule handed to the scheduler.
        strength (`None`, *optional*, defaults to 0.6):
            (unverified) How strongly the input image is altered by the denoising process.
        **denoiser_input_fields (`None`, *optional*):
            The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
        output_type (`str`, *optional*, defaults to pil):
            The type of the output images, can be 'pil', 'np', 'pt'

    Outputs:
        images (`list`):
            Generated images.
    """

    # Workflow name -> the inputs that workflow requires.
    _workflow_map = {
        "text2image": {"prompt": True},
        "image2image": {"image": True, "prompt": True},
    }

    # `block_names` and `block_classes` are parallel lists: entry i of one labels entry i of the other.
    block_names = ["text_encoder", "vae_encoder", "denoise", "decode"]
    block_classes = [
        ZImageTextEncoderStep,
        ZImageAutoVaeImageEncoderStep,
        ZImageAutoDenoiseStep,
        ZImageVaeDecoderStep,
    ]

    @property
    def outputs(self):
        """Return the single output of the pipeline: the `images` template parameter."""
        images_param = OutputParam.template("images")
        return [images_param]

    @property
    def description(self) -> str:
        """Return the one-line human-readable summary of this pipeline."""
        summary = "Auto Modular pipeline for text-to-image and image-to-image using ZImage."
        return summary