from typing import List, Optional, Union

import PIL.Image
import torch
from transformers import CLIPTokenizer

from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import PNDMScheduler
from ...utils import (
    logging,
    replace_example_docstring,
)
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
from .blip_image_processing import BlipImageProcessor
from .modeling_blip2 import Blip2QFormerModel
from .modeling_ctx_clip import ContextCLIPTextModel


logger = logging.get_logger(__name__)

| | EXAMPLE_DOC_STRING = """ |
| | Examples: |
| | ```py |
| | >>> from diffusers.pipelines import BlipDiffusionPipeline |
| | >>> from diffusers.utils import load_image |
| | >>> import torch |
| | |
| | >>> blip_diffusion_pipe = BlipDiffusionPipeline.from_pretrained( |
| | ... "Salesforce/blipdiffusion", torch_dtype=torch.float16 |
| | ... ).to("cuda") |
| | |
| | |
| | >>> cond_subject = "dog" |
| | >>> tgt_subject = "dog" |
| | >>> text_prompt_input = "swimming underwater" |
| | |
| | >>> cond_image = load_image( |
| | ... "https://huggingface.co/datasets/ayushtues/blipdiffusion_images/resolve/main/dog.jpg" |
| | ... ) |
| | >>> guidance_scale = 7.5 |
| | >>> num_inference_steps = 25 |
| | >>> negative_prompt = "over-exposure, under-exposure, saturated, duplicate, out of frame, lowres, cropped, worst quality, low quality, jpeg artifacts, morbid, mutilated, out of frame, ugly, bad anatomy, bad proportions, deformed, blurry, duplicate" |
| | |
| | |
| | >>> output = blip_diffusion_pipe( |
| | ... text_prompt_input, |
| | ... cond_image, |
| | ... cond_subject, |
| | ... tgt_subject, |
| | ... guidance_scale=guidance_scale, |
| | ... num_inference_steps=num_inference_steps, |
| | ... neg_prompt=negative_prompt, |
| | ... height=512, |
| | ... width=512, |
| | ... ).images |
| | >>> output[0].save("image.png") |
| | ``` |
| | """ |
| |
|
| |
|
class BlipDiffusionPipeline(DiffusionPipeline):
    """
    Pipeline for Zero-Shot Subject Driven Generation using Blip Diffusion.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.).

    Args:
        tokenizer ([`CLIPTokenizer`]):
            Tokenizer for the text encoder.
        text_encoder ([`ContextCLIPTextModel`]):
            Text encoder to encode the text prompt.
        vae ([`AutoencoderKL`]):
            VAE model to map the latents to the image.
        unet ([`UNet2DConditionModel`]):
            Conditional U-Net architecture to denoise the image embedding.
        scheduler ([`PNDMScheduler`]):
            A scheduler to be used in combination with `unet` to generate image latents.
        qformer ([`Blip2QFormerModel`]):
            QFormer model to get multi-modal embeddings from the text and image.
        image_processor ([`BlipImageProcessor`]):
            Image processor to preprocess and postprocess the image.
        ctx_begin_pos (`int`, *optional*, defaults to 2):
            Position of the context token in the text encoder.
    """
    model_cpu_offload_seq = "qformer->text_encoder->unet->vae"

    def __init__(
        self,
        tokenizer: CLIPTokenizer,
        text_encoder: ContextCLIPTextModel,
        vae: AutoencoderKL,
        unet: UNet2DConditionModel,
        scheduler: PNDMScheduler,
        qformer: Blip2QFormerModel,
        image_processor: BlipImageProcessor,
        ctx_begin_pos: int = 2,
        mean: Optional[List[float]] = None,
        std: Optional[List[float]] = None,
    ):
        super().__init__()

        self.register_modules(
            tokenizer=tokenizer,
            text_encoder=text_encoder,
            vae=vae,
            unet=unet,
            scheduler=scheduler,
            qformer=qformer,
            image_processor=image_processor,
        )
        self.register_to_config(ctx_begin_pos=ctx_begin_pos, mean=mean, std=std)

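    # The Q-Former fuses the reference image and the source-subject text into
    # query embeddings that act as context for the text encoder.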
    def get_query_embeddings(self, input_image, src_subject):
        return self.qformer(image_input=input_image, text_input=src_subject, return_dict=False)

    def _build_prompt(self, prompts, tgt_subjects, prompt_strength=1.0, prompt_reps=20):
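        # Prepend the target subject to each prompt, then repeat the phrase
        # int(prompt_strength * prompt_reps) times to strengthen subject conditioning.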
        rv = []
        for prompt, tgt_subject in zip(prompts, tgt_subjects):
            prompt = f"a {tgt_subject} {prompt.strip()}"

            rv.append(", ".join([prompt] * int(prompt_strength * prompt_reps)))

        return rv

    def prepare_latents(self, batch_size, num_channels, height, width, dtype, device, generator, latents=None):
        shape = (batch_size, num_channels, height, width)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if latents is None:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        else:
            latents = latents.to(device=device, dtype=dtype)

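        # Scale the initial noise by the standard deviation required by the scheduler.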
        latents = latents * self.scheduler.init_noise_sigma
        return latents

    def encode_prompt(self, query_embeds, prompt, device=None):
        device = device or self._execution_device

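        # Reserve room in the text sequence for the Q-Former query tokens,
        # which are injected as context embeddings starting at ctx_begin_pos.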
        max_len = self.text_encoder.text_model.config.max_position_embeddings
        max_len -= self.qformer.config.num_query_tokens

        tokenized_prompt = self.tokenizer(
            prompt,
            padding="max_length",
            truncation=True,
            max_length=max_len,
            return_tensors="pt",
        ).to(device)

        batch_size = query_embeds.shape[0]
        ctx_begin_pos = [self.config.ctx_begin_pos] * batch_size

        text_embeddings = self.text_encoder(
            input_ids=tokenized_prompt.input_ids,
            ctx_embeddings=query_embeds,
            ctx_begin_pos=ctx_begin_pos,
        )[0]

        return text_embeddings

    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        prompt: List[str],
        reference_image: PIL.Image.Image,
        source_subject_category: List[str],
        target_subject_category: List[str],
        latents: Optional[torch.Tensor] = None,
        guidance_scale: float = 7.5,
        height: int = 512,
        width: int = 512,
        num_inference_steps: int = 50,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        neg_prompt: Optional[str] = "",
        prompt_strength: float = 1.0,
        prompt_reps: int = 20,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
    ):
        """
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`List[str]`):
                The prompt or prompts to guide the image generation.
            reference_image (`PIL.Image.Image`):
                The reference image to condition the generation on.
            source_subject_category (`List[str]`):
                The source subject category.
            target_subject_category (`List[str]`):
                The target subject category.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a
                latents tensor will be generated by random sampling.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2 of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. A higher guidance scale encourages generating images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            height (`int`, *optional*, defaults to 512):
                The height of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            neg_prompt (`str`, *optional*, defaults to ""):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e.,
                ignored if `guidance_scale` is less than `1`).
            prompt_strength (`float`, *optional*, defaults to 1.0):
                The strength of the prompt. Specifies the number of times the prompt is repeated along with
                `prompt_reps` to amplify the prompt.
            prompt_reps (`int`, *optional*, defaults to 20):
                The number of times the prompt is repeated along with `prompt_strength` to amplify the prompt.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`) or `"pt"` (`torch.Tensor`).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return an [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`
        """
        device = self._execution_device

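        # Normalize and resize the subject image with the BLIP image processor, then move it to the execution device.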
        reference_image = self.image_processor.preprocess(
            reference_image, image_mean=self.config.mean, image_std=self.config.std, return_tensors="pt"
        )["pixel_values"]
        reference_image = reference_image.to(device)

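        # Accept plain strings by promoting them to single-element lists.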
        if isinstance(prompt, str):
            prompt = [prompt]
        if isinstance(source_subject_category, str):
            source_subject_category = [source_subject_category]
        if isinstance(target_subject_category, str):
            target_subject_category = [target_subject_category]

        batch_size = len(prompt)

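        # Build the subject-aware prompt, extract Q-Former query embeddings from the
        # reference image, and encode both with the context-aware CLIP text encoder.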
        prompt = self._build_prompt(
            prompts=prompt,
            tgt_subjects=target_subject_category,
            prompt_strength=prompt_strength,
            prompt_reps=prompt_reps,
        )
        query_embeds = self.get_query_embeddings(reference_image, source_subject_category)
        text_embeddings = self.encode_prompt(query_embeds, prompt, device)
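        # Classifier-free guidance requires an additional embedding of the negative prompt.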
        do_classifier_free_guidance = guidance_scale > 1.0
        if do_classifier_free_guidance:
            max_length = self.text_encoder.text_model.config.max_position_embeddings

            uncond_input = self.tokenizer(
                [neg_prompt] * batch_size,
                padding="max_length",
                max_length=max_length,
                return_tensors="pt",
            )
            uncond_embeddings = self.text_encoder(
                input_ids=uncond_input.input_ids.to(device),
                ctx_embeddings=None,
            )[0]
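            # Concatenate the unconditional and conditional embeddings into a single
            # batch so both branches run in one UNet forward pass per step.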
            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

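        # Latents are prepared in the VAE's downsampled space: one factor of 2 per UNet resolution level.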
        scale_down_factor = 2 ** (len(self.unet.config.block_out_channels) - 1)
        latents = self.prepare_latents(
            batch_size=batch_size,
            num_channels=self.unet.config.in_channels,
            height=height // scale_down_factor,
            width=width // scale_down_factor,
            generator=generator,
            latents=latents,
            dtype=self.unet.dtype,
            device=device,
        )
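        # Set the scheduler timesteps for the requested number of denoising steps.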
        extra_set_kwargs = {}
        self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)

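        # Denoising loop: predict the noise residual at each timestep and step the scheduler.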
        for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)):
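            # Duplicate the latents so the conditional and unconditional branches share one UNet call.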
            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents

            noise_pred = self.unet(
                latent_model_input,
                timestep=t,
                encoder_hidden_states=text_embeddings,
                down_block_additional_residuals=None,
                mid_block_additional_residual=None,
            )["sample"]

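            # Perform guidance: split the joint prediction and blend the two halves with guidance_scale.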
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

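            # Compute the previous, less noisy latent sample.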
            latents = self.scheduler.step(
                noise_pred,
                t,
                latents,
            )["prev_sample"]

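        # Decode the final latents with the VAE and convert to the requested output format.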
        image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
        image = self.image_processor.postprocess(image, output_type=output_type)

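        # Offload all models (frees the hooks set up by model CPU offloading, if any).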
        self.maybe_free_model_hooks()

        if not return_dict:
            return (image,)

        return ImagePipelineOutput(images=image)