Add files using upload-large-folder tool

4f4376a verified 3 months ago

12.5 kB

	# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	from ...utils import logging
	from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
	from ..modular_pipeline_utils import OutputParam
	from .before_denoise import (
	ZImageAdditionalInputsStep,
	ZImagePrepareLatentsStep,
	ZImagePrepareLatentswithImageStep,
	ZImageSetTimestepsStep,
	ZImageSetTimestepsWithStrengthStep,
	ZImageTextInputStep,
	)
	from .decoders import ZImageVaeDecoderStep
	from .denoise import (
	ZImageDenoiseStep,
	)
	from .encoders import (
	ZImageTextEncoderStep,
	ZImageVaeImageEncoderStep,
	)


	logger = logging.get_logger(__name__) # pylint: disable=invalid-name


	# ====================
	# 1. DENOISE
	# ====================


	# text2image: inputs(text) -> prepare_latents -> set_timesteps -> denoise
	# auto_docstring
	class ZImageCoreDenoiseStep(SequentialPipelineBlocks):
	    """
	    Denoise block that takes encoded conditions and runs the denoising process.

	    Sub-blocks, in execution order: `input` -> `prepare_latents` -> `set_timesteps` -> `denoise`.

	    Components:
	        transformer (`ZImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
	        (`ClassifierFreeGuidance`)

	    Inputs:
	        num_images_per_prompt (`None`, optional, defaults to 1):
	            TODO: Add description.
	        prompt_embeds (`list`):
	            Pre-generated text embeddings. Can be generated from text_encoder step.
	        negative_prompt_embeds (`list`, optional):
	            Pre-generated negative text embeddings. Can be generated from text_encoder step.
	        height (`int`, optional):
	            TODO: Add description.
	        width (`int`, optional):
	            TODO: Add description.
	        latents (`Tensor | NoneType`, optional):
	            TODO: Add description.
	        generator (`None`, optional):
	            TODO: Add description.
	        num_inference_steps (`None`, optional, defaults to 9):
	            TODO: Add description.
	        sigmas (`None`, optional):
	            TODO: Add description.
	        *denoiser_input_fields (`None`, optional):
	            The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.

	    Outputs:
	        latents (`Tensor`):
	            Denoised latents.
	    """

	    # `block_classes[i]` is registered under `block_names[i]`; keep both lists in the same order.
	    block_classes = [
	        ZImageTextInputStep,
	        ZImagePrepareLatentsStep,
	        ZImageSetTimestepsStep,
	        ZImageDenoiseStep,
	    ]
	    block_names = ["input", "prepare_latents", "set_timesteps", "denoise"]

	    @property
	    def description(self) -> str:
	        """One-sentence, human-readable summary of this block."""
	        return "denoise block that takes encoded conditions and runs the denoising process."

	    @property
	    def outputs(self):
	        """Output declaration: a single entry built from the `latents` `OutputParam` template."""
	        return [OutputParam.template("latents")]


	# image2image: inputs(text + image_latents) -> prepare_latents -> set_timesteps -> set_timesteps_with_strength -> prepare_latents_with_image -> denoise
	# auto_docstring
	class ZImageImage2ImageCoreDenoiseStep(SequentialPipelineBlocks):
	    """
	    Denoise block that takes encoded text and image latent conditions and runs the denoising process.

	    Sub-blocks, in execution order: `input` -> `additional_inputs` -> `prepare_latents` -> `set_timesteps` ->
	    `set_timesteps_with_strength` -> `prepare_latents_with_image` -> `denoise`.

	    Components:
	        transformer (`ZImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
	        (`ClassifierFreeGuidance`)

	    Inputs:
	        num_images_per_prompt (`None`, optional, defaults to 1):
	            TODO: Add description.
	        prompt_embeds (`list`):
	            Pre-generated text embeddings. Can be generated from text_encoder step.
	        negative_prompt_embeds (`list`, optional):
	            Pre-generated negative text embeddings. Can be generated from text_encoder step.
	        height (`None`, optional):
	            TODO: Add description.
	        width (`None`, optional):
	            TODO: Add description.
	        image_latents (`None`, optional):
	            TODO: Add description.
	        latents (`Tensor | NoneType`, optional):
	            TODO: Add description.
	        generator (`None`, optional):
	            TODO: Add description.
	        num_inference_steps (`None`, optional, defaults to 9):
	            TODO: Add description.
	        sigmas (`None`, optional):
	            TODO: Add description.
	        strength (`None`, optional, defaults to 0.6):
	            TODO: Add description.
	        *denoiser_input_fields (`None`, optional):
	            The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.

	    Outputs:
	        latents (`Tensor`):
	            Denoised latents.
	    """

	    # `block_classes[i]` is registered under `block_names[i]`; keep both lists in the same order.
	    block_classes = [
	        ZImageTextInputStep,
	        # Unlike the other entries (classes), this one is an instance, configured so that
	        # `image_latents` is passed as its `image_latent_inputs`.
	        ZImageAdditionalInputsStep(image_latent_inputs=["image_latents"]),
	        ZImagePrepareLatentsStep,
	        ZImageSetTimestepsStep,
	        ZImageSetTimestepsWithStrengthStep,
	        ZImagePrepareLatentswithImageStep,
	        ZImageDenoiseStep,
	    ]
	    block_names = [
	        "input",
	        "additional_inputs",
	        "prepare_latents",
	        "set_timesteps",
	        "set_timesteps_with_strength",
	        "prepare_latents_with_image",
	        "denoise",
	    ]

	    @property
	    def description(self) -> str:
	        """One-sentence, human-readable summary of this block."""
	        return "denoise block that takes encoded text and image latent conditions and runs the denoising process."

	    @property
	    def outputs(self):
	        """Output declaration: a single entry built from the `latents` `OutputParam` template."""
	        return [OutputParam.template("latents")]


	# auto_docstring
	class ZImageAutoDenoiseStep(AutoPipelineBlocks):
	    """
	    Denoise step that iteratively denoise the latents. This is a auto pipeline block that works for text2image and
	    image2image tasks.

	    - `ZImageCoreDenoiseStep` (text2image) for text2image tasks.
	    - `ZImageImage2ImageCoreDenoiseStep` (image2image) for image2image tasks.
	    - if `image_latents` is provided, `ZImageImage2ImageCoreDenoiseStep` will be used.
	    - if `image_latents` is not provided, `ZImageCoreDenoiseStep` will be used.

	    Components:
	        transformer (`ZImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
	        (`ClassifierFreeGuidance`)

	    Inputs:
	        num_images_per_prompt (`None`, optional, defaults to 1):
	            TODO: Add description.
	        prompt_embeds (`list`):
	            Pre-generated text embeddings. Can be generated from text_encoder step.
	        negative_prompt_embeds (`list`, optional):
	            Pre-generated negative text embeddings. Can be generated from text_encoder step.
	        height (`None`, optional):
	            TODO: Add description.
	        width (`None`, optional):
	            TODO: Add description.
	        image_latents (`None`, optional):
	            TODO: Add description.
	        latents (`Tensor | NoneType`):
	            TODO: Add description.
	        generator (`None`, optional):
	            TODO: Add description.
	        num_inference_steps (`None`):
	            TODO: Add description.
	        sigmas (`None`, optional):
	            TODO: Add description.
	        strength (`None`, optional, defaults to 0.6):
	            TODO: Add description.
	        *denoiser_input_fields (`None`, optional):
	            The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.

	    Outputs:
	        latents (`Tensor`):
	            Denoised latents.
	    """

	    # The three lists are index-aligned: candidate block, its name, and the input that selects it.
	    block_classes = [
	        ZImageImage2ImageCoreDenoiseStep,
	        ZImageCoreDenoiseStep,
	    ]
	    block_names = ["image2image", "text2image"]
	    # Per the description below: `image_latents` selects image2image; the `None` entry
	    # marks text2image as the choice when `image_latents` is not provided.
	    block_trigger_inputs = ["image_latents", None]

	    @property
	    def description(self) -> str:
	        """Human-readable summary of this block, including which sub-block is chosen when."""
	        # Adjacent literals are joined at compile time; only the last two fragments end in a newline.
	        return (
	            "Denoise step that iteratively denoise the latents. "
	            "This is a auto pipeline block that works for text2image and image2image tasks."
	            " - `ZImageCoreDenoiseStep` (text2image) for text2image tasks."
	            " - `ZImageImage2ImageCoreDenoiseStep` (image2image) for image2image tasks."
	            " - if `image_latents` is provided, `ZImageImage2ImageCoreDenoiseStep` will be used.\n"
	            " - if `image_latents` is not provided, `ZImageCoreDenoiseStep` will be used.\n"
	        )


	# auto_docstring
	class ZImageAutoVaeImageEncoderStep(AutoPipelineBlocks):
	    """
	    Vae Image Encoder step that encode the image to generate the image latents. This is an auto pipeline block that
	    works for image2image tasks.

	    - `ZImageVaeImageEncoderStep` is used when `image` is provided.
	    - if `image` is not provided, step will be skipped.

	    Components:
	        vae (`AutoencoderKL`) image_processor (`VaeImageProcessor`)

	    Inputs:
	        image (`Image`, optional):
	            TODO: Add description.
	        height (`None`, optional):
	            TODO: Add description.
	        width (`None`, optional):
	            TODO: Add description.
	        generator (`None`, optional):
	            TODO: Add description.

	    Outputs:
	        image_latents (`Tensor`):
	            Latent representation of the encoded input image.
	    """

	    # Index-aligned lists: the single candidate block, its name, and the input that triggers it.
	    block_classes = [ZImageVaeImageEncoderStep]
	    block_names = ["vae_encoder"]
	    block_trigger_inputs = ["image"]

	    @property
	    def description(self) -> str:
	        """Human-readable summary of this block, including when the encoder runs or is skipped."""
	        # The parentheses are required: without them the `return` ends on its first line and
	        # the remaining fragments become unreachable statements, truncating the description.
	        return (
	            "Vae Image Encoder step that encode the image to generate the image latents. "
	            "This is an auto pipeline block that works for image2image tasks."
	            " - `ZImageVaeImageEncoderStep` is used when `image` is provided."
	            " - if `image` is not provided, step will be skipped."
	        )


	# auto_docstring
	class ZImageAutoBlocks(SequentialPipelineBlocks):
	    """
	    Auto Modular pipeline for text-to-image and image-to-image using ZImage.

	    Sub-blocks, in execution order: `text_encoder` -> `vae_encoder` -> `denoise` -> `decode`.

	    Supported workflows:
	        - `text2image`: requires `prompt`
	        - `image2image`: requires `image`, `prompt`

	    Components:
	        text_encoder (`Qwen3Model`) tokenizer (`Qwen2Tokenizer`) guider (`ClassifierFreeGuidance`) vae
	        (`AutoencoderKL`) image_processor (`VaeImageProcessor`) transformer (`ZImageTransformer2DModel`) scheduler
	        (`FlowMatchEulerDiscreteScheduler`)

	    Inputs:
	        prompt (`None`, optional):
	            TODO: Add description.
	        negative_prompt (`None`, optional):
	            TODO: Add description.
	        max_sequence_length (`None`, optional, defaults to 512):
	            TODO: Add description.
	        image (`Image`, optional):
	            TODO: Add description.
	        height (`None`, optional):
	            TODO: Add description.
	        width (`None`, optional):
	            TODO: Add description.
	        generator (`None`, optional):
	            TODO: Add description.
	        num_images_per_prompt (`None`, optional, defaults to 1):
	            TODO: Add description.
	        image_latents (`None`, optional):
	            TODO: Add description.
	        latents (`Tensor | NoneType`):
	            TODO: Add description.
	        num_inference_steps (`None`):
	            TODO: Add description.
	        sigmas (`None`, optional):
	            TODO: Add description.
	        strength (`None`, optional, defaults to 0.6):
	            TODO: Add description.
	        *denoiser_input_fields (`None`, optional):
	            The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
	        output_type (`str`, optional, defaults to pil):
	            The type of the output images, can be 'pil', 'np', 'pt'

	    Outputs:
	        images (`list`):
	            Generated images.
	    """

	    # `block_classes[i]` is registered under `block_names[i]`; keep both lists in the same order.
	    block_classes = [
	        ZImageTextEncoderStep,
	        ZImageAutoVaeImageEncoderStep,
	        ZImageAutoDenoiseStep,
	        ZImageVaeDecoderStep,
	    ]
	    block_names = ["text_encoder", "vae_encoder", "denoise", "decode"]
	    # Workflow name -> inputs that workflow requires (mirrors "Supported workflows" in the docstring).
	    _workflow_map = {
	        "text2image": {"prompt": True},
	        "image2image": {"image": True, "prompt": True},
	    }

	    @property
	    def description(self) -> str:
	        """One-sentence, human-readable summary of this pipeline."""
	        return "Auto Modular pipeline for text-to-image and image-to-image using ZImage."

	    @property
	    def outputs(self):
	        """Output declaration: a single entry built from the `images` `OutputParam` template."""
	        return [OutputParam.template("images")]