Spaces:

blanchon
/

rgb2x

Running on Zero

App Files Files Community

rgb2x / x2rgb /pipeline_x2rgb.py

blanchon

Initial commit

a9af355 about 1 year ago

raw

history blame

40.3 kB

	import inspect
	from dataclasses import dataclass
	from typing import Callable, List, Optional, Union

	import numpy as np
	import PIL
	import torch
	import torch.nn.functional as F
	from diffusers.configuration_utils import register_to_config
	from diffusers.image_processor import VaeImageProcessor
	from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
	from diffusers.models import AutoencoderKL, UNet2DConditionModel
	from diffusers.pipelines.pipeline_utils import DiffusionPipeline
	from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import (
	rescale_noise_cfg,
	)
	from diffusers.schedulers import KarrasDiffusionSchedulers
	from diffusers.utils import CONFIG_NAME, BaseOutput, deprecate, logging, randn_tensor
	from transformers import CLIPTextModel, CLIPTokenizer

	logger = logging.get_logger(__name__)


	class VaeImageProcrssorAOV(VaeImageProcessor):
	    """
	    Image processor for VAE AOV.

	    Extends `VaeImageProcessor` with an optional gamma-correction step in
	    `postprocess` and a pass-through `preprocess_normal` for normal maps.

	    Args:
	        do_resize (`bool`, optional, defaults to `True`):
	            Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`.
	        vae_scale_factor (`int`, optional, defaults to `8`):
	            VAE scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of this factor.
	        resample (`str`, optional, defaults to `lanczos`):
	            Resampling filter to use when resizing the image.
	        do_normalize (`bool`, optional, defaults to `True`):
	            Whether to normalize the image to [-1,1].
	    """

	    config_name = CONFIG_NAME

	    @register_to_config
	    def __init__(
	        self,
	        do_resize: bool = True,
	        vae_scale_factor: int = 8,
	        resample: str = "lanczos",
	        do_normalize: bool = True,
	    ):
	        # Forward the arguments so the parent constructor sees the values the
	        # caller asked for instead of falling back to its own defaults.
	        super().__init__(
	            do_resize=do_resize,
	            vae_scale_factor=vae_scale_factor,
	            resample=resample,
	            do_normalize=do_normalize,
	        )

	    def postprocess(
	        self,
	        image: torch.FloatTensor,
	        output_type: str = "pil",
	        do_denormalize: Optional[List[bool]] = None,
	        do_gamma_correction: bool = True,
	    ):
	        """
	        Convert a batch of decoded images into the requested output format.

	        Args:
	            image (`torch.FloatTensor`):
	                Batch of images; the first dimension is iterated as the batch.
	            output_type (`str`, optional, defaults to `"pil"`):
	                One of `"latent"`, `"pt"`, `"np"` or `"pil"`. Any other value triggers a deprecation
	                warning and is treated as `"np"`.
	            do_denormalize (`List[bool]`, optional):
	                Per-sample flag selecting whether `self.denormalize` is applied. Defaults to
	                `self.config.do_normalize` for every sample.
	            do_gamma_correction (`bool`, optional, defaults to `True`):
	                Whether to raise the image to the power `1 / 2.2` after denormalization.

	        Returns:
	            The input unchanged for `"latent"`, a tensor for `"pt"`, the result of `pt_to_numpy` for
	            `"np"`, or the result of `numpy_to_pil` for `"pil"`.

	        Raises:
	            ValueError: If `image` is not a `torch.Tensor`.
	        """
	        if not isinstance(image, torch.Tensor):
	            raise ValueError(
	                f"Input for postprocessing is in incorrect format: {type(image)}. We only support pytorch tensor"
	            )
	        if output_type not in ["latent", "pt", "np", "pil"]:
	            deprecation_message = (
	                f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: "
	                "`pil`, `np`, `pt`, `latent`"
	            )
	            deprecate(
	                "Unsupported output_type",
	                "1.0.0",
	                deprecation_message,
	                standard_warn=False,
	            )
	            output_type = "np"

	        # Latents are returned untouched: no denormalization, no gamma.
	        if output_type == "latent":
	            return image

	        if do_denormalize is None:
	            do_denormalize = [self.config.do_normalize] * image.shape[0]

	        image = torch.stack(
	            [
	                self.denormalize(sample) if flag else sample
	                for sample, flag in zip(image, do_denormalize)
	            ]
	        )

	        # Gamma correction
	        if do_gamma_correction:
	            # A fractional power of a negative value is NaN; samples that were
	            # not denormalized may still be negative, so clamp at zero first.
	            image = torch.pow(image.clamp(min=0.0), 1.0 / 2.2)

	        if output_type == "pt":
	            return image

	        image = self.pt_to_numpy(image)

	        if output_type == "np":
	            return image

	        if output_type == "pil":
	            return self.numpy_to_pil(image)

	    def preprocess_normal(
	        self,
	        image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray],
	        height: Optional[int] = None,
	        width: Optional[int] = None,
	    ) -> torch.Tensor:
	        """
	        Add a leading batch dimension to a normal map without touching its values.

	        NOTE(review): despite the annotation, `torch.stack` only accepts tensors,
	        so PIL images and NumPy arrays are rejected with a `TypeError`.
	        `height` and `width` are accepted for signature parity but are not used.
	        """
	        return torch.stack([image], dim=0)


	@dataclass
	class StableDiffusionAOVPipelineOutput(BaseOutput):
	    """
	    Output class for Stable Diffusion AOV pipelines.

	    Args:
	        images (`List[PIL.Image.Image]` or `np.ndarray`)
	            List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
	            num_channels)`.
	        predicted_x0_images (`List[PIL.Image.Image]` or `np.ndarray`, optional, defaults to `None`)
	            Optional second output field; `None` unless the producer fills it in.
	    """

	    # Final generated images.
	    images: Union[List[PIL.Image.Image], np.ndarray]
	    # Optional extra images; left as `None` when not provided.
	    predicted_x0_images: Optional[Union[List[PIL.Image.Image], np.ndarray]] = None


	class StableDiffusionAOVDropoutPipeline(
	DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin
	):
	r"""
	Pipeline for AOVs.

	This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
	implemented for all pipelines (downloading, saving, running on a particular device, etc.).

	The pipeline also inherits the following loading methods:
	- [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
	- [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
	- [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights

	Args:
	vae ([`AutoencoderKL`]):
	Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
	text_encoder ([`~transformers.CLIPTextModel`]):
	Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
	tokenizer ([`~transformers.CLIPTokenizer`]):
	A `CLIPTokenizer` to tokenize text.
	unet ([`UNet2DConditionModel`]):
	A `UNet2DConditionModel` to denoise the encoded image latents.
	scheduler ([`SchedulerMixin`]):
	A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
	[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
	"""

	def __init__(
	    self,
	    vae: AutoencoderKL,
	    text_encoder: CLIPTextModel,
	    tokenizer: CLIPTokenizer,
	    unet: UNet2DConditionModel,
	    scheduler: KarrasDiffusionSchedulers,
	):
	    """Register the sub-models and build the AOV image processor."""
	    super().__init__()

	    components = {
	        "vae": vae,
	        "text_encoder": text_encoder,
	        "tokenizer": tokenizer,
	        "unet": unet,
	        "scheduler": scheduler,
	    }
	    self.register_modules(**components)

	    # Spatial reduction factor derived from the number of VAE blocks.
	    num_vae_blocks = len(self.vae.config.block_out_channels)
	    self.vae_scale_factor = 2 ** (num_vae_blocks - 1)

	    self.image_processor = VaeImageProcrssorAOV(
	        vae_scale_factor=self.vae_scale_factor
	    )
	    self.register_to_config()

	def _encode_prompt(
	    self,
	    prompt,
	    device,
	    num_images_per_prompt,
	    do_classifier_free_guidance,
	    negative_prompt=None,
	    prompt_embeds: Optional[torch.FloatTensor] = None,
	    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
	):
	    r"""
	    Encodes the prompt into text encoder hidden states.

	    Args:
	        prompt (`str` or `List[str]`, optional):
	            prompt to be encoded
	        device: (`torch.device`):
	            torch device
	        num_images_per_prompt (`int`):
	            number of images that should be generated per prompt
	        do_classifier_free_guidance (`bool`):
	            whether to use classifier free guidance or not
	        negative_prompt (`str` or `List[str]`, optional):
	            The prompt or prompts not to guide the image generation. If not defined, one has to pass
	            `negative_prompt_embeds` instead. Ignored when classifier free guidance is disabled. A single
	            string is reused for every prompt in the batch.
	        prompt_embeds (`torch.FloatTensor`, optional):
	            Pre-generated text embeddings. Can be used to easily tweak text inputs, e.g. prompt weighting. If not
	            provided, text embeddings will be generated from `prompt` input argument.
	        negative_prompt_embeds (`torch.FloatTensor`, optional):
	            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, e.g. prompt
	            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
	            argument.

	    Returns:
	        `torch.FloatTensor`: the prompt embeddings repeated `num_images_per_prompt` times. With classifier
	        free guidance the batch is the concatenation
	        `[prompt_embeds, negative_prompt_embeds, negative_prompt_embeds]`.

	    Raises:
	        TypeError: If `prompt` and `negative_prompt` are both given but have different types.
	        ValueError: If a list `negative_prompt` does not match the prompt batch size.
	    """
	    if prompt is not None and isinstance(prompt, str):
	        batch_size = 1
	    elif prompt is not None and isinstance(prompt, list):
	        batch_size = len(prompt)
	    else:
	        batch_size = prompt_embeds.shape[0]

	    def _embed(tokenized):
	        # Run the text encoder, passing an attention mask only when its
	        # config asks for one.
	        if getattr(self.text_encoder.config, "use_attention_mask", False):
	            attention_mask = tokenized.attention_mask.to(device)
	        else:
	            attention_mask = None
	        encoded = self.text_encoder(
	            tokenized.input_ids.to(device),
	            attention_mask=attention_mask,
	        )
	        return encoded[0]

	    if prompt_embeds is None:
	        # textual inversion: process multi-vector tokens if necessary
	        if isinstance(self, TextualInversionLoaderMixin):
	            prompt = self.maybe_convert_prompt(prompt, self.tokenizer)

	        text_inputs = self.tokenizer(
	            prompt,
	            padding="max_length",
	            max_length=self.tokenizer.model_max_length,
	            truncation=True,
	            return_tensors="pt",
	        )
	        text_input_ids = text_inputs.input_ids
	        untruncated_ids = self.tokenizer(
	            prompt, padding="longest", return_tensors="pt"
	        ).input_ids

	        # Warn when the tokenizer had to drop part of the prompt.
	        was_truncated = untruncated_ids.shape[-1] >= text_input_ids.shape[
	            -1
	        ] and not torch.equal(text_input_ids, untruncated_ids)
	        if was_truncated:
	            removed_text = self.tokenizer.batch_decode(
	                untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
	            )
	            logger.warning(
	                "The following part of your input was truncated because CLIP can only handle sequences up to"
	                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
	            )

	        prompt_embeds = _embed(text_inputs)

	    prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)

	    bs_embed, seq_len, _ = prompt_embeds.shape
	    # duplicate text embeddings for each generation per prompt, using mps friendly method
	    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
	    prompt_embeds = prompt_embeds.view(
	        bs_embed * num_images_per_prompt, seq_len, -1
	    )

	    # get unconditional embeddings for classifier free guidance
	    if do_classifier_free_guidance and negative_prompt_embeds is None:
	        uncond_tokens: List[str]
	        if negative_prompt is None:
	            uncond_tokens = [""] * batch_size
	        elif prompt is not None and type(prompt) is not type(negative_prompt):
	            # Only comparable when a textual prompt was given; with
	            # `prompt_embeds` alone, `prompt` is None and any textual
	            # `negative_prompt` must be accepted.
	            raise TypeError(
	                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
	                f" {type(prompt)}."
	            )
	        elif isinstance(negative_prompt, str):
	            # Reuse the single negative prompt for every batch entry so the
	            # reshape below also works for batched `prompt_embeds`.
	            uncond_tokens = [negative_prompt] * batch_size
	        elif batch_size != len(negative_prompt):
	            raise ValueError(
	                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
	                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
	                " the batch size of `prompt`."
	            )
	        else:
	            uncond_tokens = negative_prompt

	        # textual inversion: process multi-vector tokens if necessary
	        if isinstance(self, TextualInversionLoaderMixin):
	            uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)

	        # Pad the negative prompt to the same sequence length as the prompt.
	        max_length = prompt_embeds.shape[1]
	        uncond_input = self.tokenizer(
	            uncond_tokens,
	            padding="max_length",
	            max_length=max_length,
	            truncation=True,
	            return_tensors="pt",
	        )
	        negative_prompt_embeds = _embed(uncond_input)

	    if do_classifier_free_guidance:
	        # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
	        seq_len = negative_prompt_embeds.shape[1]

	        negative_prompt_embeds = negative_prompt_embeds.to(
	            dtype=self.text_encoder.dtype, device=device
	        )

	        negative_prompt_embeds = negative_prompt_embeds.repeat(
	            1, num_images_per_prompt, 1
	        )
	        negative_prompt_embeds = negative_prompt_embeds.view(
	            batch_size * num_images_per_prompt, seq_len, -1
	        )

	        # The three guidance branches are evaluated in one batched forward
	        # pass. Unlike most pipelines, the order here is
	        # [prompt_embeds, negative_prompt_embeds, negative_prompt_embeds].
	        prompt_embeds = torch.cat(
	            [prompt_embeds, negative_prompt_embeds, negative_prompt_embeds]
	        )

	    return prompt_embeds

	def prepare_extra_step_kwargs(self, generator, eta):
	    """
	    Build the optional keyword arguments for `self.scheduler.step`.

	    Scheduler `step` signatures differ, so `eta` and `generator` are only
	    forwarded when the current scheduler's `step` declares a parameter of
	    that name. `eta` corresponds to η in the DDIM paper
	    (https://arxiv.org/abs/2010.02502) and should be between [0, 1].

	    Returns:
	        `dict`: a mapping containing `"eta"` and/or `"generator"`, or empty.
	    """
	    step_parameters = inspect.signature(self.scheduler.step).parameters
	    optional_arguments = (("eta", eta), ("generator", generator))
	    return {
	        name: value
	        for name, value in optional_arguments
	        if name in step_parameters
	    }

	def check_inputs(
	    self,
	    prompt,
	    callback_steps,
	    negative_prompt=None,
	    prompt_embeds=None,
	    negative_prompt_embeds=None,
	):
	    """
	    Validate the text-related call arguments.

	    Raises:
	        ValueError: If `callback_steps` is not a positive integer; if both or
	            neither of `prompt` / `prompt_embeds` are given; if `prompt` is
	            not a `str` or `list`; if both `negative_prompt` and
	            `negative_prompt_embeds` are given; or if the two embedding
	            tensors have different shapes.
	    """
	    steps_are_valid = isinstance(callback_steps, int) and callback_steps > 0
	    if not steps_are_valid:
	        raise ValueError(
	            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
	            f" {type(callback_steps)}."
	        )

	    has_prompt = prompt is not None
	    has_prompt_embeds = prompt_embeds is not None

	    if has_prompt and has_prompt_embeds:
	        raise ValueError(
	            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
	            " only forward one of the two."
	        )
	    if not has_prompt and not has_prompt_embeds:
	        raise ValueError(
	            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
	        )
	    if has_prompt and not isinstance(prompt, (str, list)):
	        raise ValueError(
	            f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
	        )

	    has_negative_embeds = negative_prompt_embeds is not None

	    if negative_prompt is not None and has_negative_embeds:
	        raise ValueError(
	            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
	            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
	        )

	    # Pre-computed embeddings must agree in shape.
	    if (
	        has_prompt_embeds
	        and has_negative_embeds
	        and prompt_embeds.shape != negative_prompt_embeds.shape
	    ):
	        raise ValueError(
	            "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
	            f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
	            f" {negative_prompt_embeds.shape}."
	        )

	def prepare_latents(
	    self,
	    batch_size,
	    num_channels_latents,
	    height,
	    width,
	    dtype,
	    device,
	    generator,
	    latents=None,
	):
	    """
	    Return the initial latents for the denoising loop.

	    Fresh noise of shape `(batch_size, num_channels_latents,
	    height // vae_scale_factor, width // vae_scale_factor)` is drawn unless
	    `latents` is supplied, in which case it is only moved to `device`. The
	    result is multiplied by the scheduler's `init_noise_sigma`.

	    Raises:
	        ValueError: If `generator` is a list whose length differs from
	            `batch_size`.
	    """
	    latent_height = height // self.vae_scale_factor
	    latent_width = width // self.vae_scale_factor
	    shape = (batch_size, num_channels_latents, latent_height, latent_width)

	    if isinstance(generator, list) and len(generator) != batch_size:
	        raise ValueError(
	            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
	            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
	        )

	    if latents is not None:
	        latents = latents.to(device)
	    else:
	        latents = randn_tensor(
	            shape, generator=generator, device=device, dtype=dtype
	        )

	    # scale the initial noise by the standard deviation required by the scheduler
	    return latents * self.scheduler.init_noise_sigma

	def prepare_image_latents(
	    self,
	    image,
	    batch_size,
	    num_images_per_prompt,
	    dtype,
	    device,
	    do_classifier_free_guidance,
	    generator=None,
	):
	    """
	    Turn a conditioning image into latents sized for the denoising batch.

	    An input whose second dimension is 4 is used as-is, i.e. treated as
	    already-encoded latents; anything else is encoded with the VAE and the
	    mode of the latent distribution is taken. The latents are then
	    replicated to `batch_size * num_images_per_prompt` and, for classifier
	    free guidance, concatenated as `[latents, latents, zeros]`.

	    NOTE(review): the type check admits PIL images and lists, but the
	    following `.to(...)` call only works on tensors.

	    Raises:
	        ValueError: If `image` has an unsupported type, if a generator list
	            does not match the effective batch size, or if the image batch
	            cannot be evenly replicated to the effective batch size.
	    """
	    supported_types = (torch.Tensor, PIL.Image.Image, list)
	    if not isinstance(image, supported_types):
	        raise ValueError(
	            f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
	        )

	    image = image.to(device=device, dtype=dtype)

	    batch_size = batch_size * num_images_per_prompt
	    has_generator_list = isinstance(generator, list)

	    if image.shape[1] == 4:
	        image_latents = image
	    else:
	        if has_generator_list and len(generator) != batch_size:
	            raise ValueError(
	                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
	                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
	            )

	        if has_generator_list:
	            # Encode one sample at a time when a generator list is given.
	            per_sample_latents = [
	                self.vae.encode(image[i : i + 1]).latent_dist.mode()
	                for i in range(batch_size)
	            ]
	            image_latents = torch.cat(per_sample_latents, dim=0)
	        else:
	            image_latents = self.vae.encode(image).latent_dist.mode()

	    if batch_size > image_latents.shape[0]:
	        if batch_size % image_latents.shape[0] != 0:
	            raise ValueError(
	                f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
	            )

	        # expand image_latents for batch_size
	        deprecation_message = (
	            f"You have passed {batch_size} text prompts (`prompt`), but only {image_latents.shape[0]} initial"
	            " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
	            " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
	            " your script to pass as many initial images as text prompts to suppress this warning."
	        )
	        deprecate(
	            "len(prompt) != len(image)",
	            "1.0.0",
	            deprecation_message,
	            standard_warn=False,
	        )
	        copies = batch_size // image_latents.shape[0]
	        image_latents = torch.cat([image_latents] * copies, dim=0)
	    else:
	        image_latents = torch.cat([image_latents], dim=0)

	    if do_classifier_free_guidance:
	        # The third guidance branch sees an all-zero image condition.
	        image_latents = torch.cat(
	            [image_latents, image_latents, torch.zeros_like(image_latents)],
	            dim=0,
	        )

	    return image_latents

	@torch.no_grad()
	def __call__(
	    self,
	    height: int,
	    width: int,
	    prompt: Union[str, List[str]] = None,
	    albedo: Optional[
	        Union[
	            torch.FloatTensor,
	            PIL.Image.Image,
	            np.ndarray,
	            List[torch.FloatTensor],
	            List[PIL.Image.Image],
	            List[np.ndarray],
	        ]
	    ] = None,
	    normal: Optional[
	        Union[
	            torch.FloatTensor,
	            PIL.Image.Image,
	            np.ndarray,
	            List[torch.FloatTensor],
	            List[PIL.Image.Image],
	            List[np.ndarray],
	        ]
	    ] = None,
	    roughness: Optional[
	        Union[
	            torch.FloatTensor,
	            PIL.Image.Image,
	            np.ndarray,
	            List[torch.FloatTensor],
	            List[PIL.Image.Image],
	            List[np.ndarray],
	        ]
	    ] = None,
	    metallic: Optional[
	        Union[
	            torch.FloatTensor,
	            PIL.Image.Image,
	            np.ndarray,
	            List[torch.FloatTensor],
	            List[PIL.Image.Image],
	            List[np.ndarray],
	        ]
	    ] = None,
	    irradiance: Optional[
	        Union[
	            torch.FloatTensor,
	            PIL.Image.Image,
	            np.ndarray,
	            List[torch.FloatTensor],
	            List[PIL.Image.Image],
	            List[np.ndarray],
	        ]
	    ] = None,
	    guidance_scale: float = 0.0,
	    image_guidance_scale: float = 0.0,
	    guidance_rescale: float = 0.0,
	    num_inference_steps: int = 100,
	    required_aovs: List[str] = ["albedo"],
	    return_predicted_x0s: bool = False,
	    negative_prompt: Optional[Union[str, List[str]]] = None,
	    num_images_per_prompt: Optional[int] = 1,
	    eta: float = 0.0,
	    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
	    latents: Optional[torch.FloatTensor] = None,
	    prompt_embeds: Optional[torch.FloatTensor] = None,
	    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
	    output_type: Optional[str] = "pil",
	    return_dict: bool = True,
	    callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
	    callback_steps: int = 1,
	):
	    r"""
	    The call function to the pipeline for generation.

	    Args:
	        height (`int`):
	            Height in pixels of the generated image; the latent height is `height // vae_scale_factor`.
	        width (`int`):
	            Width in pixels of the generated image; the latent width is `width // vae_scale_factor`.
	        prompt (`str` or `List[str]`, optional):
	            The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
	        albedo, normal, roughness, metallic, irradiance (optional):
	            Conditioning AOV images. Only the names listed in `required_aovs` are used. An AOV that is
	            required but left as `None` is replaced by an all-zero condition. `normal` goes through
	            `preprocess_normal`, the others through the image processor's `preprocess`. `irradiance` is
	            resized to the latent resolution instead of being encoded by the VAE.
	        guidance_scale (`float`, optional, defaults to 0.0):
	            Weight of the text guidance term.
	        image_guidance_scale (`float`, optional, defaults to 0.0):
	            Weight of the image guidance term. Classifier free guidance is only enabled when both
	            `guidance_scale >= 1.0` and `image_guidance_scale >= 1.0`.
	        guidance_rescale (`float`, optional, defaults to 0.0):
	            When positive and guidance is enabled, the noise prediction is rescaled following section 3.4 of
	            https://arxiv.org/pdf/2305.08891.pdf.
	        num_inference_steps (`int`, optional, defaults to 100):
	            The number of denoising steps.
	        required_aovs (`List[str]`, optional, defaults to `["albedo"]`):
	            Names of the AOVs concatenated, in this order, to the UNet input. Recognized names are
	            `"albedo"`, `"normal"`, `"roughness"`, `"metallic"` and `"irradiance"`; other names are skipped
	            with a warning. The list is only read, never modified.
	        return_predicted_x0s (`bool`, optional, defaults to `False`):
	            Whether to also collect and return the scheduler's second output of every step.
	        negative_prompt (`str` or `List[str]`, optional):
	            The prompt or prompts to guide what to not include in image generation. Ignored when guidance is
	            disabled.
	        num_images_per_prompt (`int`, optional, defaults to 1):
	            The number of images to generate per prompt.
	        eta (`float`, optional, defaults to 0.0):
	            Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only
	            forwarded to schedulers whose `step` accepts it.
	        generator (`torch.Generator`, optional):
	            A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
	            generation deterministic.
	        latents (`torch.FloatTensor`, optional):
	            Pre-generated noisy latents. If not provided, a latents tensor is generated by sampling using the
	            supplied random `generator`.
	        prompt_embeds (`torch.FloatTensor`, optional):
	            Pre-generated text embeddings. If not provided, they are generated from `prompt`.
	        negative_prompt_embeds (`torch.FloatTensor`, optional):
	            Pre-generated negative text embeddings. If not provided, they are generated from
	            `negative_prompt`.
	        output_type (`str`, optional, defaults to `"pil"`):
	            The output format: `"pil"`, `"np"`, `"pt"` or `"latent"`.
	        return_dict (`bool`, optional, defaults to `True`):
	            Whether to return a [`StableDiffusionAOVPipelineOutput`] instead of the bare images.
	        callback (`Callable`, optional):
	            Called as `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
	        callback_steps (`int`, optional, defaults to 1):
	            The frequency at which the `callback` function is called.

	    Returns:
	        [`StableDiffusionAOVPipelineOutput`] if `return_dict` is `True`; its `predicted_x0_images` field is
	        only filled when `return_predicted_x0s` is `True`. Otherwise the generated images are returned
	        directly (not wrapped in a tuple) and the per-step predictions are discarded.

	    Raises:
	        ValueError: If the inputs fail `check_inputs`, if the irradiance batch cannot be replicated to the
	            effective batch size, or if the stacked latent and AOV channels do not match
	            `unet.config.in_channels`.
	    """
	    # 0. Check inputs
	    self.check_inputs(
	        prompt,
	        callback_steps,
	        negative_prompt,
	        prompt_embeds,
	        negative_prompt_embeds,
	    )

	    # 1. Define call parameters
	    if prompt is not None and isinstance(prompt, str):
	        batch_size = 1
	    elif prompt is not None and isinstance(prompt, list):
	        batch_size = len(prompt)
	    else:
	        batch_size = prompt_embeds.shape[0]

	    # Number of samples actually denoised; every per-sample tensor built
	    # below must use this size so the channel-wise concatenation lines up.
	    effective_batch_size = batch_size * num_images_per_prompt

	    device = self._execution_device
	    do_classifier_free_guidance = (
	        guidance_scale >= 1.0 and image_guidance_scale >= 1.0
	    )

	    # 2. Encode input prompt
	    prompt_embeds = self._encode_prompt(
	        prompt,
	        device,
	        num_images_per_prompt,
	        do_classifier_free_guidance,
	        negative_prompt,
	        prompt_embeds=prompt_embeds,
	        negative_prompt_embeds=negative_prompt_embeds,
	    )

	    # 3. Preprocess image
	    # Normals only get a batch dimension; the other AOVs go through the
	    # image processor's regular preprocessing.
	    aov_inputs = {
	        "albedo": albedo,
	        "normal": normal,
	        "roughness": roughness,
	        "metallic": metallic,
	        "irradiance": irradiance,
	    }
	    preprocessed_aovs = {}
	    for aov_name in required_aovs:
	        if aov_name not in aov_inputs:
	            logger.warning(
	                f"Ignoring unknown AOV name in `required_aovs`: {aov_name!r}."
	            )
	            continue
	        aov = aov_inputs[aov_name]
	        if aov is None:
	            preprocessed_aovs[aov_name] = None
	        elif aov_name == "normal":
	            preprocessed_aovs[aov_name] = self.image_processor.preprocess_normal(
	                aov
	            )
	        else:
	            preprocessed_aovs[aov_name] = self.image_processor.preprocess(aov)

	    # 4. set timesteps
	    self.scheduler.set_timesteps(num_inference_steps, device=device)
	    timesteps = self.scheduler.timesteps

	    # 5. Prepare latent variables
	    num_channels_latents = self.vae.config.latent_channels
	    latents = self.prepare_latents(
	        effective_batch_size,
	        num_channels_latents,
	        height,
	        width,
	        prompt_embeds.dtype,
	        device,
	        generator,
	        latents,
	    )

	    height_latent, width_latent = latents.shape[-2:]

	    # 6. Prepare Image latents
	    image_latents = []
	    # Magical scaling factors for each AOV (calculated from the training data)
	    scaling_factors = {
	        "albedo": 0.17301377137652138,
	        "normal": 0.17483895473058078,
	        "roughness": 0.1680724853626448,
	        "metallic": 0.13135013390855135,
	    }
	    for aov_name, aov in preprocessed_aovs.items():
	        if aov is None:
	            # Missing AOV: all-zero condition, sized for the effective
	            # batch so it can be concatenated with the other conditions.
	            image_latent = torch.zeros(
	                effective_batch_size,
	                num_channels_latents,
	                height_latent,
	                width_latent,
	                dtype=prompt_embeds.dtype,
	                device=device,
	            )
	            if aov_name == "irradiance":
	                # Irradiance is not VAE-encoded and uses 3 channels.
	                image_latent = image_latent[:, 0:3]
	            if do_classifier_free_guidance:
	                image_latent = torch.cat([image_latent] * 3, dim=0)
	        elif aov_name == "irradiance":
	            # Irradiance bypasses the VAE and is resized to the latent grid.
	            image_latent = F.interpolate(
	                aov.to(device=device, dtype=prompt_embeds.dtype),
	                size=(height_latent, width_latent),
	                mode="bilinear",
	                align_corners=False,
	                antialias=True,
	            )
	            # Replicate to the effective batch the same way
	            # `prepare_image_latents` does for the encoded AOVs.
	            provided = image_latent.shape[0]
	            if effective_batch_size > provided:
	                if effective_batch_size % provided != 0:
	                    raise ValueError(
	                        f"Cannot duplicate `irradiance` of batch size {provided} to"
	                        f" {effective_batch_size} samples."
	                    )
	                image_latent = torch.cat(
	                    [image_latent] * (effective_batch_size // provided), dim=0
	                )
	            if do_classifier_free_guidance:
	                uncond_image_latent = torch.zeros_like(image_latent)
	                image_latent = torch.cat(
	                    [image_latent, image_latent, uncond_image_latent], dim=0
	                )
	        else:
	            encoded = self.prepare_image_latents(
	                aov,
	                batch_size,
	                num_images_per_prompt,
	                prompt_embeds.dtype,
	                device,
	                do_classifier_free_guidance,
	                generator,
	            )
	            image_latent = encoded * scaling_factors[aov_name]
	        image_latents.append(image_latent)
	    image_latents = torch.cat(image_latents, dim=1)

	    # 7. Check that shapes of latents and image match the UNet channels
	    num_channels_image = image_latents.shape[1]
	    if num_channels_latents + num_channels_image != self.unet.config.in_channels:
	        raise ValueError(
	            f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
	            f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
	            f" `num_channels_image`: {num_channels_image} "
	            f" = {num_channels_latents+num_channels_image}. Please verify the config of"
	            " `pipeline.unet` or your `image` input."
	        )

	    # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
	    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

	    predicted_x0s = []

	    # 9. Denoising loop
	    num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
	    with self.progress_bar(total=num_inference_steps) as progress_bar:
	        for i, t in enumerate(timesteps):
	            # With guidance the latents are tripled: one copy per guidance
	            # branch (text, image, unconditional).
	            latent_model_input = (
	                torch.cat([latents] * 3) if do_classifier_free_guidance else latents
	            )

	            # concat latents, image_latents in the channel dimension
	            scaled_latent_model_input = self.scheduler.scale_model_input(
	                latent_model_input, t
	            )
	            scaled_latent_model_input = torch.cat(
	                [scaled_latent_model_input, image_latents], dim=1
	            )

	            # predict the noise residual
	            noise_pred = self.unet(
	                scaled_latent_model_input,
	                t,
	                encoder_hidden_states=prompt_embeds,
	                return_dict=False,
	            )[0]

	            # perform guidance
	            if do_classifier_free_guidance:
	                (
	                    noise_pred_text,
	                    noise_pred_image,
	                    noise_pred_uncond,
	                ) = noise_pred.chunk(3)
	                noise_pred = (
	                    noise_pred_uncond
	                    + guidance_scale * (noise_pred_text - noise_pred_image)
	                    + image_guidance_scale * (noise_pred_image - noise_pred_uncond)
	                )

	            if do_classifier_free_guidance and guidance_rescale > 0.0:
	                # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
	                noise_pred = rescale_noise_cfg(
	                    noise_pred, noise_pred_text, guidance_rescale=guidance_rescale
	                )

	            # compute the previous noisy sample x_t -> x_t-1
	            output = self.scheduler.step(
	                noise_pred, t, latents, **extra_step_kwargs, return_dict=True
	            )

	            latents = output[0]

	            if return_predicted_x0s:
	                # Second scheduler output; presumably the predicted clean
	                # sample (x0) — verify against the scheduler in use.
	                predicted_x0s.append(output[1])

	            # call the callback, if provided
	            if i == len(timesteps) - 1 or (
	                (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
	            ):
	                progress_bar.update()
	                if callback is not None and i % callback_steps == 0:
	                    callback(i, t, latents)

	    if not output_type == "latent":
	        image = self.vae.decode(
	            latents / self.vae.config.scaling_factor, return_dict=False
	        )[0]

	        if return_predicted_x0s:
	            predicted_x0_images = [
	                self.vae.decode(
	                    predicted_x0 / self.vae.config.scaling_factor, return_dict=False
	                )[0]
	                for predicted_x0 in predicted_x0s
	            ]
	    else:
	        image = latents
	        predicted_x0_images = predicted_x0s

	    do_denormalize = [True] * image.shape[0]

	    image = self.image_processor.postprocess(
	        image, output_type=output_type, do_denormalize=do_denormalize
	    )

	    if return_predicted_x0s:
	        predicted_x0_images = [
	            self.image_processor.postprocess(
	                predicted_x0_image,
	                output_type=output_type,
	                do_denormalize=do_denormalize,
	            )
	            for predicted_x0_image in predicted_x0_images
	        ]

	    # Offload last model to CPU
	    if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
	        self.final_offload_hook.offload()

	    if not return_dict:
	        return image

	    if return_predicted_x0s:
	        return StableDiffusionAOVPipelineOutput(
	            images=image, predicted_x0_images=predicted_x0_images
	        )
	    return StableDiffusionAOVPipelineOutput(images=image)