| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import numpy as np |
| import PIL |
| import torch |
|
|
| from ...image_processor import PipelineImageInput |
| from ...loaders import ModularIPAdapterMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin |
| from ...pipelines.pipeline_utils import StableDiffusionMixin |
| from ...pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput |
| from ...utils import logging |
| from ..modular_pipeline import ModularPipeline |
| from ..modular_pipeline_utils import InputParam, OutputParam |
|
|
|
|
# Module-level logger, obtained from the package's logging utility and keyed by this module's name.
logger = logging.get_logger(__name__)
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
class StableDiffusionXLModularPipeline(
    ModularPipeline,
    StableDiffusionMixin,
    TextualInversionLoaderMixin,
    StableDiffusionXLLoraLoaderMixin,
    ModularIPAdapterMixin,
):
    """
    A ModularPipeline for Stable Diffusion XL.

    The properties below expose size and channel defaults. Each one reads its value from the `unet` or `vae`
    component when that attribute exists and is not `None`, and otherwise falls back to a hard-coded constant.

    > [!WARNING]
    > This is an experimental feature and is likely to change in the future.
    """

    default_blocks_name = "StableDiffusionXLAutoBlocks"

    @property
    def default_height(self):
        """Default image height: `default_sample_size` multiplied by `vae_scale_factor`."""
        sample_size = self.default_sample_size
        scale = self.vae_scale_factor
        return sample_size * scale

    @property
    def default_width(self):
        """Default image width: `default_sample_size` multiplied by `vae_scale_factor`."""
        sample_size = self.default_sample_size
        scale = self.vae_scale_factor
        return sample_size * scale

    @property
    def default_sample_size(self):
        """`unet.config.sample_size` when a unet is set, otherwise 128."""
        unet = getattr(self, "unet", None)
        if unet is None:
            return 128
        return unet.config.sample_size

    @property
    def vae_scale_factor(self):
        """`2 ** (len(vae.config.block_out_channels) - 1)` when a vae is set, otherwise 8."""
        vae = getattr(self, "vae", None)
        if vae is None:
            return 8
        num_blocks = len(vae.config.block_out_channels)
        return 2 ** (num_blocks - 1)

    @property
    def num_channels_unet(self):
        """`unet.config.in_channels` when a unet is set, otherwise 4."""
        unet = getattr(self, "unet", None)
        if unet is None:
            return 4
        return unet.config.in_channels

    @property
    def num_channels_latents(self):
        """`vae.config.latent_channels` when a vae is set, otherwise 4."""
        vae = getattr(self, "vae", None)
        if vae is None:
            return 4
        return vae.config.latent_channels
|
|
|
|
| |
| |
# Catalogue of `InputParam` declarations for the SDXL modular pipeline, keyed by parameter name.
# Each key is identical to the name passed as the first positional argument of its `InputParam`.
# Entries carry a type hint and a description, plus either a default or `required=True` where declared.
SDXL_INPUTS_SCHEMA = {
    # Text prompts
    "prompt": InputParam(
        "prompt", type_hint=str | list[str], description="The prompt or prompts to guide the image generation"
    ),
    "prompt_2": InputParam(
        "prompt_2",
        type_hint=str | list[str],
        description="The prompt or prompts to be sent to the tokenizer_2 and text_encoder_2",
    ),
    "negative_prompt": InputParam(
        "negative_prompt",
        type_hint=str | list[str],
        description="The prompt or prompts not to guide the image generation",
    ),
    "negative_prompt_2": InputParam(
        "negative_prompt_2",
        type_hint=str | list[str],
        description="The negative prompt or prompts for text_encoder_2",
    ),
    "cross_attention_kwargs": InputParam(
        "cross_attention_kwargs",
        type_hint=dict | None,
        description="Kwargs dictionary passed to the AttentionProcessor",
    ),
    "clip_skip": InputParam(
        "clip_skip", type_hint=int | None, description="Number of layers to skip in CLIP text encoder"
    ),
    # Image and mask inputs
    "image": InputParam(
        "image",
        type_hint=PipelineImageInput,
        required=True,
        description="The image(s) to modify for img2img or inpainting",
    ),
    "mask_image": InputParam(
        "mask_image",
        type_hint=PipelineImageInput,
        required=True,
        description="Mask image for inpainting, white pixels will be repainted",
    ),
    # Sampling and scheduling
    "generator": InputParam(
        "generator",
        type_hint=torch.Generator | list[torch.Generator] | None,
        description="Generator(s) for deterministic generation",
    ),
    "height": InputParam("height", type_hint=int | None, description="Height in pixels of the generated image"),
    "width": InputParam("width", type_hint=int | None, description="Width in pixels of the generated image"),
    "num_images_per_prompt": InputParam(
        "num_images_per_prompt", type_hint=int, default=1, description="Number of images to generate per prompt"
    ),
    "num_inference_steps": InputParam(
        "num_inference_steps", type_hint=int, default=50, description="Number of denoising steps"
    ),
    "timesteps": InputParam(
        "timesteps", type_hint=torch.Tensor | None, description="Custom timesteps for the denoising process"
    ),
    "sigmas": InputParam(
        "sigmas", type_hint=torch.Tensor | None, description="Custom sigmas for the denoising process"
    ),
    "denoising_end": InputParam(
        "denoising_end",
        type_hint=float | None,
        description="Fraction of denoising process to complete before termination",
    ),
    "strength": InputParam(
        "strength", type_hint=float, default=0.3, description="How much to transform the reference image"
    ),
    "denoising_start": InputParam(
        "denoising_start", type_hint=float | None, description="Starting point of the denoising process"
    ),
    "latents": InputParam(
        "latents", type_hint=torch.Tensor | None, description="Pre-generated noisy latents for image generation"
    ),
    "padding_mask_crop": InputParam(
        "padding_mask_crop",
        type_hint=tuple[int, int] | None,
        description="Size of margin in crop for image and mask",
    ),
    # SDXL micro-conditioning
    "original_size": InputParam(
        "original_size",
        type_hint=tuple[int, int] | None,
        description="Original size of the image for SDXL's micro-conditioning",
    ),
    "target_size": InputParam(
        "target_size", type_hint=tuple[int, int] | None, description="Target size for SDXL's micro-conditioning"
    ),
    "negative_original_size": InputParam(
        "negative_original_size",
        type_hint=tuple[int, int] | None,
        description="Negative conditioning based on image resolution",
    ),
    "negative_target_size": InputParam(
        "negative_target_size",
        type_hint=tuple[int, int] | None,
        description="Negative conditioning based on target resolution",
    ),
    "crops_coords_top_left": InputParam(
        "crops_coords_top_left",
        type_hint=tuple[int, int],
        default=(0, 0),
        description="Top-left coordinates for SDXL's micro-conditioning",
    ),
    "negative_crops_coords_top_left": InputParam(
        "negative_crops_coords_top_left",
        type_hint=tuple[int, int],
        default=(0, 0),
        description="Negative conditioning crop coordinates",
    ),
    "aesthetic_score": InputParam(
        "aesthetic_score", type_hint=float, default=6.0, description="Simulates aesthetic score of generated image"
    ),
    "negative_aesthetic_score": InputParam(
        "negative_aesthetic_score", type_hint=float, default=2.0, description="Simulates negative aesthetic score"
    ),
    "eta": InputParam("eta", type_hint=float, default=0.0, description="Parameter η in the DDIM paper"),
    "output_type": InputParam(
        "output_type", type_hint=str, default="pil", description="Output format (pil/tensor/np.array)"
    ),
    # IP-Adapter and ControlNet
    "ip_adapter_image": InputParam(
        "ip_adapter_image",
        type_hint=PipelineImageInput,
        required=True,
        description="Image(s) to be used as IP adapter",
    ),
    "control_image": InputParam(
        "control_image", type_hint=PipelineImageInput, required=True, description="ControlNet input condition"
    ),
    "control_guidance_start": InputParam(
        "control_guidance_start",
        type_hint=float | list[float],
        default=0.0,
        description="When ControlNet starts applying",
    ),
    "control_guidance_end": InputParam(
        "control_guidance_end",
        type_hint=float | list[float],
        default=1.0,
        description="When ControlNet stops applying",
    ),
    "controlnet_conditioning_scale": InputParam(
        "controlnet_conditioning_scale",
        type_hint=float | list[float],
        default=1.0,
        description="Scale factor for ControlNet outputs",
    ),
    "guess_mode": InputParam(
        "guess_mode",
        type_hint=bool,
        default=False,
        description="Enables ControlNet encoder to recognize input without prompts",
    ),
    "control_mode": InputParam(
        "control_mode", type_hint=list[int], required=True, description="Control mode for union controlnet"
    ),
    # Tensor-valued inputs
    "prompt_embeds": InputParam(
        "prompt_embeds",
        type_hint=torch.Tensor,
        required=True,
        description="Text embeddings used to guide image generation",
    ),
    "negative_prompt_embeds": InputParam(
        "negative_prompt_embeds", type_hint=torch.Tensor, description="Negative text embeddings"
    ),
    "pooled_prompt_embeds": InputParam(
        "pooled_prompt_embeds", type_hint=torch.Tensor, required=True, description="Pooled text embeddings"
    ),
    "negative_pooled_prompt_embeds": InputParam(
        "negative_pooled_prompt_embeds", type_hint=torch.Tensor, description="Negative pooled text embeddings"
    ),
    "batch_size": InputParam("batch_size", type_hint=int, required=True, description="Number of prompts"),
    "dtype": InputParam("dtype", type_hint=torch.dtype, description="Data type of model tensor inputs"),
    "preprocess_kwargs": InputParam(
        "preprocess_kwargs", type_hint=dict | None, description="Kwargs for ImageProcessor"
    ),
    "latent_timestep": InputParam(
        "latent_timestep", type_hint=torch.Tensor, required=True, description="Initial noise level timestep"
    ),
    "image_latents": InputParam(
        "image_latents", type_hint=torch.Tensor, required=True, description="Latents representing reference image"
    ),
    "mask": InputParam("mask", type_hint=torch.Tensor, required=True, description="Mask for inpainting"),
    "masked_image_latents": InputParam(
        "masked_image_latents", type_hint=torch.Tensor, description="Masked image latents for inpainting"
    ),
    "add_time_ids": InputParam(
        "add_time_ids", type_hint=torch.Tensor, required=True, description="Time ids for conditioning"
    ),
    "negative_add_time_ids": InputParam(
        "negative_add_time_ids", type_hint=torch.Tensor, description="Negative time ids"
    ),
    "timestep_cond": InputParam("timestep_cond", type_hint=torch.Tensor, description="Timestep conditioning for LCM"),
    "noise": InputParam("noise", type_hint=torch.Tensor, description="Noise added to image latents"),
    "crops_coords": InputParam("crops_coords", type_hint=tuple[int] | None, description="Crop coordinates"),
    "ip_adapter_embeds": InputParam(
        "ip_adapter_embeds", type_hint=list[torch.Tensor], description="Image embeddings for IP-Adapter"
    ),
    "negative_ip_adapter_embeds": InputParam(
        "negative_ip_adapter_embeds",
        type_hint=list[torch.Tensor],
        description="Negative image embeddings for IP-Adapter",
    ),
    "images": InputParam(
        "images",
        # `np.ndarray` is the array type; `np.array` is a factory function and is not valid as a type hint.
        type_hint=list[PIL.Image.Image] | list[torch.Tensor] | list[np.ndarray],
        required=True,
        description="Generated images",
    ),
}
|
|
|
|
# Catalogue of `OutputParam` declarations for intermediate values of the SDXL modular pipeline,
# keyed by parameter name. Each key is identical to the name passed as the first positional
# argument of its `OutputParam`.
SDXL_INTERMEDIATE_OUTPUTS_SCHEMA = {
    "prompt_embeds": OutputParam(
        "prompt_embeds", type_hint=torch.Tensor, description="Text embeddings used to guide image generation"
    ),
    "negative_prompt_embeds": OutputParam(
        "negative_prompt_embeds", type_hint=torch.Tensor, description="Negative text embeddings"
    ),
    "pooled_prompt_embeds": OutputParam(
        "pooled_prompt_embeds", type_hint=torch.Tensor, description="Pooled text embeddings"
    ),
    "negative_pooled_prompt_embeds": OutputParam(
        "negative_pooled_prompt_embeds", type_hint=torch.Tensor, description="Negative pooled text embeddings"
    ),
    "batch_size": OutputParam("batch_size", type_hint=int, description="Number of prompts"),
    "dtype": OutputParam("dtype", type_hint=torch.dtype, description="Data type of model tensor inputs"),
    "image_latents": OutputParam(
        "image_latents", type_hint=torch.Tensor, description="Latents representing reference image"
    ),
    "mask": OutputParam("mask", type_hint=torch.Tensor, description="Mask for inpainting"),
    "masked_image_latents": OutputParam(
        "masked_image_latents", type_hint=torch.Tensor, description="Masked image latents for inpainting"
    ),
    "crops_coords": OutputParam("crops_coords", type_hint=tuple[int] | None, description="Crop coordinates"),
    "timesteps": OutputParam("timesteps", type_hint=torch.Tensor, description="Timesteps for inference"),
    "num_inference_steps": OutputParam("num_inference_steps", type_hint=int, description="Number of denoising steps"),
    "latent_timestep": OutputParam(
        "latent_timestep", type_hint=torch.Tensor, description="Initial noise level timestep"
    ),
    "add_time_ids": OutputParam("add_time_ids", type_hint=torch.Tensor, description="Time ids for conditioning"),
    "negative_add_time_ids": OutputParam(
        "negative_add_time_ids", type_hint=torch.Tensor, description="Negative time ids"
    ),
    "timestep_cond": OutputParam("timestep_cond", type_hint=torch.Tensor, description="Timestep conditioning for LCM"),
    "latents": OutputParam("latents", type_hint=torch.Tensor, description="Denoised latents"),
    "noise": OutputParam("noise", type_hint=torch.Tensor, description="Noise added to image latents"),
    "ip_adapter_embeds": OutputParam(
        "ip_adapter_embeds", type_hint=list[torch.Tensor], description="Image embeddings for IP-Adapter"
    ),
    "negative_ip_adapter_embeds": OutputParam(
        "negative_ip_adapter_embeds",
        type_hint=list[torch.Tensor],
        description="Negative image embeddings for IP-Adapter",
    ),
    "images": OutputParam(
        "images",
        # `np.ndarray` is the array type; `np.array` is a factory function and is not valid as a type hint.
        type_hint=list[PIL.Image.Image] | list[torch.Tensor] | list[np.ndarray],
        description="Generated images",
    ),
}
|
|
|
|
# `OutputParam` declaration for the pipeline's final output, keyed by parameter name.
# The type hint allows either a tuple wrapping the image list or a `StableDiffusionXLPipelineOutput`.
SDXL_OUTPUTS_SCHEMA = {
    "images": OutputParam(
        "images",
        # `np.ndarray` is the array type; `np.array` is a factory function and is not valid as a type hint.
        type_hint=(
            tuple[list[PIL.Image.Image] | list[torch.Tensor] | list[np.ndarray]] | StableDiffusionXLPipelineOutput
        ),
        description="The final generated images",
    )
}
|
|