# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np

# Import the submodule explicitly: a bare `import PIL` does not guarantee that `PIL.Image` is loaded,
# and the schemas below reference `PIL.Image.Image` at import time.
import PIL.Image
import torch

from ...image_processor import PipelineImageInput
from ...loaders import ModularIPAdapterMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
from ...pipelines.pipeline_utils import StableDiffusionMixin
from ...pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
from ...utils import logging
from ..modular_pipeline import ModularPipeline
from ..modular_pipeline_utils import InputParam, OutputParam


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


# YiYi TODO: move to a different file? stable_diffusion_xl_module should have its own folder?
# YiYi Notes: model specific components:
## (1) it should inherit from ModularPipeline
## (2) acts like a container that holds components and configs
## (3) define default config (related to components), e.g. default_sample_size, vae_scale_factor, num_channels_unet, num_channels_latents
## (4) inherit from model-specific loader class (e.g. StableDiffusionXLLoraLoaderMixin)
## (5) how to use together with Components_manager?
class StableDiffusionXLModularPipeline(
    ModularPipeline,
    StableDiffusionMixin,
    TextualInversionLoaderMixin,
    StableDiffusionXLLoraLoaderMixin,
    ModularIPAdapterMixin,
):
    """
    A ModularPipeline for Stable Diffusion XL.

    The properties defined here expose component-derived defaults (sample size, VAE scale factor, channel counts).
    Each one falls back to a hard-coded constant when the corresponding component (`unet` or `vae`) is missing or
    `None`.

    > [!WARNING] > This is an experimental feature and is likely to change in the future.
    """

    default_blocks_name = "StableDiffusionXLAutoBlocks"

    @property
    def default_height(self):
        """Default height: `default_sample_size * vae_scale_factor`."""
        return self.default_sample_size * self.vae_scale_factor

    @property
    def default_width(self):
        """Default width: `default_sample_size * vae_scale_factor`."""
        return self.default_sample_size * self.vae_scale_factor

    @property
    def default_sample_size(self):
        """`unet.config.sample_size` when a `unet` is set, otherwise 128."""
        unet = getattr(self, "unet", None)
        if unet is not None:
            return unet.config.sample_size
        return 128

    @property
    def vae_scale_factor(self):
        """`2 ** (len(vae.config.block_out_channels) - 1)` when a `vae` is set, otherwise 8."""
        vae = getattr(self, "vae", None)
        if vae is not None:
            return 2 ** (len(vae.config.block_out_channels) - 1)
        return 8

    # YiYi TODO: change to num_channels_latents
    @property
    def num_channels_unet(self):
        """`unet.config.in_channels` when a `unet` is set, otherwise 4."""
        unet = getattr(self, "unet", None)
        if unet is not None:
            return unet.config.in_channels
        return 4

    @property
    def num_channels_latents(self):
        """`vae.config.latent_channels` when a `vae` is set, otherwise 4."""
        vae = getattr(self, "vae", None)
        if vae is not None:
            return vae.config.latent_channels
        return 4


# YiYi/Sayak TODO: not used yet, maintain a list of schema that can be used across all pipeline blocks
# auto_docstring
# Maps each input name to the `InputParam` describing its type hint, default, required flag and description.
SDXL_INPUTS_SCHEMA = {
    "prompt": InputParam(
        "prompt", type_hint=str | list[str], description="The prompt or prompts to guide the image generation"
    ),
    "prompt_2": InputParam(
        "prompt_2",
        type_hint=str | list[str],
        description="The prompt or prompts to be sent to the tokenizer_2 and text_encoder_2",
    ),
    "negative_prompt": InputParam(
        "negative_prompt",
        type_hint=str | list[str],
        description="The prompt or prompts not to guide the image generation",
    ),
    "negative_prompt_2": InputParam(
        "negative_prompt_2",
        type_hint=str | list[str],
        description="The negative prompt or prompts for text_encoder_2",
    ),
    "cross_attention_kwargs": InputParam(
        "cross_attention_kwargs",
        type_hint=dict | None,
        description="Kwargs dictionary passed to the AttentionProcessor",
    ),
    "clip_skip": InputParam(
        "clip_skip", type_hint=int | None, description="Number of layers to skip in CLIP text encoder"
    ),
    "image": InputParam(
        "image",
        type_hint=PipelineImageInput,
        required=True,
        description="The image(s) to modify for img2img or inpainting",
    ),
    "mask_image": InputParam(
        "mask_image",
        type_hint=PipelineImageInput,
        required=True,
        description="Mask image for inpainting, white pixels will be repainted",
    ),
    "generator": InputParam(
        "generator",
        type_hint=torch.Generator | list[torch.Generator] | None,
        description="Generator(s) for deterministic generation",
    ),
    "height": InputParam("height", type_hint=int | None, description="Height in pixels of the generated image"),
    "width": InputParam("width", type_hint=int | None, description="Width in pixels of the generated image"),
    "num_images_per_prompt": InputParam(
        "num_images_per_prompt", type_hint=int, default=1, description="Number of images to generate per prompt"
    ),
    "num_inference_steps": InputParam(
        "num_inference_steps", type_hint=int, default=50, description="Number of denoising steps"
    ),
    "timesteps": InputParam(
        "timesteps", type_hint=torch.Tensor | None, description="Custom timesteps for the denoising process"
    ),
    "sigmas": InputParam(
        "sigmas", type_hint=torch.Tensor | None, description="Custom sigmas for the denoising process"
    ),
    "denoising_end": InputParam(
        "denoising_end",
        type_hint=float | None,
        description="Fraction of denoising process to complete before termination",
    ),
    # YiYi Notes: img2img defaults to 0.3, inpainting defaults to 0.9999
    "strength": InputParam(
        "strength", type_hint=float, default=0.3, description="How much to transform the reference image"
    ),
    "denoising_start": InputParam(
        "denoising_start", type_hint=float | None, description="Starting point of the denoising process"
    ),
    "latents": InputParam(
        "latents", type_hint=torch.Tensor | None, description="Pre-generated noisy latents for image generation"
    ),
    "padding_mask_crop": InputParam(
        "padding_mask_crop",
        type_hint=tuple[int, int] | None,
        description="Size of margin in crop for image and mask",
    ),
    "original_size": InputParam(
        "original_size",
        type_hint=tuple[int, int] | None,
        description="Original size of the image for SDXL's micro-conditioning",
    ),
    "target_size": InputParam(
        "target_size", type_hint=tuple[int, int] | None, description="Target size for SDXL's micro-conditioning"
    ),
    "negative_original_size": InputParam(
        "negative_original_size",
        type_hint=tuple[int, int] | None,
        description="Negative conditioning based on image resolution",
    ),
    "negative_target_size": InputParam(
        "negative_target_size",
        type_hint=tuple[int, int] | None,
        description="Negative conditioning based on target resolution",
    ),
    "crops_coords_top_left": InputParam(
        "crops_coords_top_left",
        type_hint=tuple[int, int],
        default=(0, 0),
        description="Top-left coordinates for SDXL's micro-conditioning",
    ),
    "negative_crops_coords_top_left": InputParam(
        "negative_crops_coords_top_left",
        type_hint=tuple[int, int],
        default=(0, 0),
        description="Negative conditioning crop coordinates",
    ),
    "aesthetic_score": InputParam(
        "aesthetic_score", type_hint=float, default=6.0, description="Simulates aesthetic score of generated image"
    ),
    "negative_aesthetic_score": InputParam(
        "negative_aesthetic_score", type_hint=float, default=2.0, description="Simulates negative aesthetic score"
    ),
    "eta": InputParam("eta", type_hint=float, default=0.0, description="Parameter η in the DDIM paper"),
    "output_type": InputParam(
        "output_type", type_hint=str, default="pil", description="Output format (pil/tensor/np.array)"
    ),
    "ip_adapter_image": InputParam(
        "ip_adapter_image",
        type_hint=PipelineImageInput,
        required=True,
        description="Image(s) to be used as IP adapter",
    ),
    "control_image": InputParam(
        "control_image", type_hint=PipelineImageInput, required=True, description="ControlNet input condition"
    ),
    "control_guidance_start": InputParam(
        "control_guidance_start",
        type_hint=float | list[float],
        default=0.0,
        description="When ControlNet starts applying",
    ),
    "control_guidance_end": InputParam(
        "control_guidance_end",
        type_hint=float | list[float],
        default=1.0,
        description="When ControlNet stops applying",
    ),
    "controlnet_conditioning_scale": InputParam(
        "controlnet_conditioning_scale",
        type_hint=float | list[float],
        default=1.0,
        description="Scale factor for ControlNet outputs",
    ),
    "guess_mode": InputParam(
        "guess_mode",
        type_hint=bool,
        default=False,
        description="Enables ControlNet encoder to recognize input without prompts",
    ),
    "control_mode": InputParam(
        "control_mode", type_hint=list[int], required=True, description="Control mode for union controlnet"
    ),
    "prompt_embeds": InputParam(
        "prompt_embeds",
        type_hint=torch.Tensor,
        required=True,
        description="Text embeddings used to guide image generation",
    ),
    "negative_prompt_embeds": InputParam(
        "negative_prompt_embeds", type_hint=torch.Tensor, description="Negative text embeddings"
    ),
    "pooled_prompt_embeds": InputParam(
        "pooled_prompt_embeds", type_hint=torch.Tensor, required=True, description="Pooled text embeddings"
    ),
    "negative_pooled_prompt_embeds": InputParam(
        "negative_pooled_prompt_embeds", type_hint=torch.Tensor, description="Negative pooled text embeddings"
    ),
    "batch_size": InputParam("batch_size", type_hint=int, required=True, description="Number of prompts"),
    "dtype": InputParam("dtype", type_hint=torch.dtype, description="Data type of model tensor inputs"),
    "preprocess_kwargs": InputParam(
        "preprocess_kwargs", type_hint=dict | None, description="Kwargs for ImageProcessor"
    ),
    "latent_timestep": InputParam(
        "latent_timestep", type_hint=torch.Tensor, required=True, description="Initial noise level timestep"
    ),
    "image_latents": InputParam(
        "image_latents", type_hint=torch.Tensor, required=True, description="Latents representing reference image"
    ),
    "mask": InputParam("mask", type_hint=torch.Tensor, required=True, description="Mask for inpainting"),
    "masked_image_latents": InputParam(
        "masked_image_latents", type_hint=torch.Tensor, description="Masked image latents for inpainting"
    ),
    "add_time_ids": InputParam(
        "add_time_ids", type_hint=torch.Tensor, required=True, description="Time ids for conditioning"
    ),
    "negative_add_time_ids": InputParam(
        "negative_add_time_ids", type_hint=torch.Tensor, description="Negative time ids"
    ),
    "timestep_cond": InputParam("timestep_cond", type_hint=torch.Tensor, description="Timestep conditioning for LCM"),
    "noise": InputParam("noise", type_hint=torch.Tensor, description="Noise added to image latents"),
    "crops_coords": InputParam("crops_coords", type_hint=tuple[int] | None, description="Crop coordinates"),
    "ip_adapter_embeds": InputParam(
        "ip_adapter_embeds", type_hint=list[torch.Tensor], description="Image embeddings for IP-Adapter"
    ),
    "negative_ip_adapter_embeds": InputParam(
        "negative_ip_adapter_embeds",
        type_hint=list[torch.Tensor],
        description="Negative image embeddings for IP-Adapter",
    ),
    "images": InputParam(
        "images",
        # `np.ndarray` is the array type; `np.array` is only the factory function.
        type_hint=list[PIL.Image.Image] | list[torch.Tensor] | list[np.ndarray],
        required=True,
        description="Generated images",
    ),
}


# Maps each intermediate output name to the `OutputParam` describing its type hint and description.
SDXL_INTERMEDIATE_OUTPUTS_SCHEMA = {
    "prompt_embeds": OutputParam(
        "prompt_embeds", type_hint=torch.Tensor, description="Text embeddings used to guide image generation"
    ),
    "negative_prompt_embeds": OutputParam(
        "negative_prompt_embeds", type_hint=torch.Tensor, description="Negative text embeddings"
    ),
    "pooled_prompt_embeds": OutputParam(
        "pooled_prompt_embeds", type_hint=torch.Tensor, description="Pooled text embeddings"
    ),
    "negative_pooled_prompt_embeds": OutputParam(
        "negative_pooled_prompt_embeds", type_hint=torch.Tensor, description="Negative pooled text embeddings"
    ),
    "batch_size": OutputParam("batch_size", type_hint=int, description="Number of prompts"),
    "dtype": OutputParam("dtype", type_hint=torch.dtype, description="Data type of model tensor inputs"),
    "image_latents": OutputParam(
        "image_latents", type_hint=torch.Tensor, description="Latents representing reference image"
    ),
    "mask": OutputParam("mask", type_hint=torch.Tensor, description="Mask for inpainting"),
    "masked_image_latents": OutputParam(
        "masked_image_latents", type_hint=torch.Tensor, description="Masked image latents for inpainting"
    ),
    "crops_coords": OutputParam("crops_coords", type_hint=tuple[int] | None, description="Crop coordinates"),
    "timesteps": OutputParam("timesteps", type_hint=torch.Tensor, description="Timesteps for inference"),
    "num_inference_steps": OutputParam("num_inference_steps", type_hint=int, description="Number of denoising steps"),
    "latent_timestep": OutputParam(
        "latent_timestep", type_hint=torch.Tensor, description="Initial noise level timestep"
    ),
    "add_time_ids": OutputParam("add_time_ids", type_hint=torch.Tensor, description="Time ids for conditioning"),
    "negative_add_time_ids": OutputParam(
        "negative_add_time_ids", type_hint=torch.Tensor, description="Negative time ids"
    ),
    "timestep_cond": OutputParam("timestep_cond", type_hint=torch.Tensor, description="Timestep conditioning for LCM"),
    "latents": OutputParam("latents", type_hint=torch.Tensor, description="Denoised latents"),
    "noise": OutputParam("noise", type_hint=torch.Tensor, description="Noise added to image latents"),
    "ip_adapter_embeds": OutputParam(
        "ip_adapter_embeds", type_hint=list[torch.Tensor], description="Image embeddings for IP-Adapter"
    ),
    "negative_ip_adapter_embeds": OutputParam(
        "negative_ip_adapter_embeds",
        type_hint=list[torch.Tensor],
        description="Negative image embeddings for IP-Adapter",
    ),
    "images": OutputParam(
        "images",
        type_hint=list[PIL.Image.Image] | list[torch.Tensor] | list[np.ndarray],
        description="Generated images",
    ),
}


# Describes the final pipeline output: either a tuple wrapping the image list or a
# `StableDiffusionXLPipelineOutput`.
SDXL_OUTPUTS_SCHEMA = {
    "images": OutputParam(
        "images",
        type_hint=tuple[list[PIL.Image.Image] | list[torch.Tensor] | list[np.ndarray]]
        | StableDiffusionXLPipelineOutput,
        description="The final generated images",
    )
}