from typing import List, Optional, Tuple, Union import torch import inspect from diffusers import DDIMScheduler, DiffusionPipeline, ImagePipelineOutput class CondDDIMPipeline(DiffusionPipeline): r""" Pipeline for image generation. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular device, etc.). Parameters: unet ([`UNet2DModel`]): A `UNet2DModel` to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of [`DDPMScheduler`], or [`DDIMScheduler`]. """ model_cpu_offload_seq = "unet" def __init__(self, unet, scheduler): super().__init__() scheduler = DDIMScheduler.from_config(scheduler.config) self.register_modules(unet=unet, scheduler=scheduler) @torch.no_grad() def __call__( self, batch_size: int = 1, image: torch.Tensor = None, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, num_images_per_cond: Optional[int] = 1, eta: float = 0.0, num_inference_steps: int = 50, use_clipped_model_output: Optional[bool] = None, output_type: Optional[str] = "pil", return_dict: bool = True, ) -> Union[ImagePipelineOutput, Tuple]: r""" The call function to the pipeline for generation. Args: batch_size (`int`, *optional*, defaults to 1): The number of images to generate. image (torch.Tensor): The LR image(s) to condition on. generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. eta (`float`, *optional*, defaults to 0.0): Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. A value of `0` corresponds to DDIM and `1` corresponds to DDPM. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. use_clipped_model_output (`bool`, *optional*, defaults to `None`): If `True` or `False`, see documentation for [`DDIMScheduler.step`]. If `None`, nothing is passed downstream to the scheduler (use `None` for schedulers which don't support this argument). output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. """ # create random tensor of shape latents bs, _, height, width = image.shape # check that generator is on device cuda generator = torch.Generator(device=self._execution_device) latents_shape = (bs * num_images_per_cond, self.unet.config.out_channels, height, width) latents = torch.randn(latents_shape, generator=generator, device=self._execution_device, dtype=self.unet.dtype) latents_dtype = next(self.unet.parameters()).dtype # bring conditional img to device image = torch.cat([image] * num_images_per_cond) image = image.to(device=self.device, dtype=latents_dtype) # set step values self.scheduler.set_timesteps(num_inference_steps) # scale the initial noise by the standard deviation required by the scheduler latents = latents * self.scheduler.init_noise_sigma # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature. # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_kwargs = {} if accepts_eta: extra_kwargs["eta"] = eta for t in self.progress_bar(self.scheduler.timesteps): # 1. predict noise model_output latents_input = torch.cat([latents, image], dim=1) latents_input = self.scheduler.scale_model_input(latents_input, t) noise_pred = self.unet(latents_input, t).sample # 2. predict previous mean of image x_t-1 and add variance depending on eta # eta corresponds to η in paper and should be between [0, 1] # do x_t -> x_t-1 latents = self.scheduler.step( noise_pred, t, latents, eta=eta, use_clipped_model_output=use_clipped_model_output, generator=generator ).prev_sample image = latents.cpu().numpy() if output_type == "pil": image = self.numpy_to_pil(image) if not return_dict: return (image,) return ImagePipelineOutput(images=image)