Add files using upload-large-folder tool

be9fa39 verified 5 months ago

7.21 kB

	# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	from typing import List, Union

	import numpy as np
	import PIL
	import torch

	from ...configuration_utils import FrozenDict
	from ...image_processor import InpaintProcessor, VaeImageProcessor
	from ...models import AutoencoderKLQwenImage
	from ...utils import logging
	from ..modular_pipeline import ModularPipelineBlocks, PipelineState
	from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
	from .modular_pipeline import QwenImageModularPipeline, QwenImagePachifier


	logger = logging.get_logger(__name__)


	class QwenImageDecoderStep(ModularPipelineBlocks):
	model_name = "qwenimage"

	@property
	def description(self) -> str:
	return "Step that decodes the latents to images"

	@property
	def expected_components(self) -> List[ComponentSpec]:
	components = [
	ComponentSpec("vae", AutoencoderKLQwenImage),
	ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"),
	]

	return components

	@property
	def inputs(self) -> List[InputParam]:
	return [
	InputParam(name="height", required=True),
	InputParam(name="width", required=True),
	InputParam(
	name="latents",
	required=True,
	type_hint=torch.Tensor,
	description="The latents to decode, can be generated in the denoise step",
	),
	]

	@property
	def intermediate_outputs(self) -> List[str]:
	return [
	OutputParam(
	"images",
	type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]],
	description="The generated images, can be a PIL.Image.Image, torch.Tensor or a numpy array",
	)
	]

	@torch.no_grad()
	def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
	block_state = self.get_block_state(state)

	# YiYi Notes: remove support for output_type = "latents', we can just skip decode/encode step in modular
	block_state.latents = components.pachifier.unpack_latents(
	block_state.latents, block_state.height, block_state.width
	)
	block_state.latents = block_state.latents.to(components.vae.dtype)

	latents_mean = (
	torch.tensor(components.vae.config.latents_mean)
	.view(1, components.vae.config.z_dim, 1, 1, 1)
	.to(block_state.latents.device, block_state.latents.dtype)
	)
	latents_std = 1.0 / torch.tensor(components.vae.config.latents_std).view(
	1, components.vae.config.z_dim, 1, 1, 1
	).to(block_state.latents.device, block_state.latents.dtype)
	block_state.latents = block_state.latents / latents_std + latents_mean
	block_state.images = components.vae.decode(block_state.latents, return_dict=False)[0][:, :, 0]

	self.set_block_state(state, block_state)
	return components, state


	class QwenImageProcessImagesOutputStep(ModularPipelineBlocks):
	model_name = "qwenimage"

	@property
	def description(self) -> str:
	return "postprocess the generated image"

	@property
	def expected_components(self) -> List[ComponentSpec]:
	return [
	ComponentSpec(
	"image_processor",
	VaeImageProcessor,
	config=FrozenDict({"vae_scale_factor": 16}),
	default_creation_method="from_config",
	),
	]

	@property
	def inputs(self) -> List[InputParam]:
	return [
	InputParam("images", required=True, description="the generated image from decoders step"),
	InputParam(
	name="output_type",
	default="pil",
	type_hint=str,
	description="The type of the output images, can be 'pil', 'np', 'pt'",
	),
	]

	@staticmethod
	def check_inputs(output_type):
	if output_type not in ["pil", "np", "pt"]:
	raise ValueError(f"Invalid output_type: {output_type}")

	@torch.no_grad()
	def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
	block_state = self.get_block_state(state)

	self.check_inputs(block_state.output_type)

	block_state.images = components.image_processor.postprocess(
	image=block_state.images,
	output_type=block_state.output_type,
	)

	self.set_block_state(state, block_state)
	return components, state


	class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks):
	model_name = "qwenimage"

	@property
	def description(self) -> str:
	return "postprocess the generated image, optional apply the mask overally to the original image.."

	@property
	def expected_components(self) -> List[ComponentSpec]:
	return [
	ComponentSpec(
	"image_mask_processor",
	InpaintProcessor,
	config=FrozenDict({"vae_scale_factor": 16}),
	default_creation_method="from_config",
	),
	]

	@property
	def inputs(self) -> List[InputParam]:
	return [
	InputParam("images", required=True, description="the generated image from decoders step"),
	InputParam(
	name="output_type",
	default="pil",
	type_hint=str,
	description="The type of the output images, can be 'pil', 'np', 'pt'",
	),
	InputParam("mask_overlay_kwargs"),
	]

	@staticmethod
	def check_inputs(output_type, mask_overlay_kwargs):
	if output_type not in ["pil", "np", "pt"]:
	raise ValueError(f"Invalid output_type: {output_type}")

	if mask_overlay_kwargs and output_type != "pil":
	raise ValueError("only support output_type 'pil' for mask overlay")

	@torch.no_grad()
	def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
	block_state = self.get_block_state(state)

	self.check_inputs(block_state.output_type, block_state.mask_overlay_kwargs)

	if block_state.mask_overlay_kwargs is None:
	mask_overlay_kwargs = {}
	else:
	mask_overlay_kwargs = block_state.mask_overlay_kwargs

	block_state.images = components.image_mask_processor.postprocess(
	image=block_state.images,
	**mask_overlay_kwargs,
	)

	self.set_block_state(state, block_state)
	return components, state