from typing import Optional

import torch
from PIL import Image
from tqdm.auto import tqdm
from transformers import CLIPTextModel, CLIPTokenizer

from diffusers import AutoencoderKL, DDIMScheduler, DiffusionPipeline, UNet2DConditionModel
from diffusers.image_processor import VaeImageProcessor
from diffusers.utils import deprecate


class EDICTPipeline(DiffusionPipeline):
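    """
    Pipeline for text-guided image editing with EDICT (Exact Diffusion Inversion via Coupled
    Transformations, Wallace et al., https://arxiv.org/abs/2211.12446). EDICT tracks two coupled
    latents that are alternately updated and re-mixed through invertible affine layers, making the
    deterministic DDIM process exactly invertible so real images can be reconstructed and edited
    faithfully.

    Args:
        vae: Variational auto-encoder mapping images to and from latent space.
        text_encoder: Frozen CLIP text encoder providing prompt conditioning.
        tokenizer: Tokenizer paired with `text_encoder`.
        unet: Conditional U-Net predicting the noise residual.
        scheduler: DDIM scheduler; EDICT relies on its deterministic updates.
        mixing_coeff: Weight of the EDICT mixing layers (`p` in the paper).
        leapfrog_steps: If `True`, alternate which latent is updated first at every other step.
    """
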
    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
        scheduler: DDIMScheduler,
        mixing_coeff: float = 0.93,
        leapfrog_steps: bool = True,
    ):
        self.mixing_coeff = mixing_coeff
        self.leapfrog_steps = leapfrog_steps

        super().__init__()
        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            scheduler=scheduler,
        )

        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

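    # Encode the prompt with CLIP; under classifier-free guidance the unconditional (or negative)
    # embedding is concatenated in front, matching the [uncond, cond] chunk order used below.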
    def _encode_prompt(
        self, prompt: str, negative_prompt: Optional[str] = None, do_classifier_free_guidance: bool = False
    ):
        text_inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )

        prompt_embeds = self.text_encoder(text_inputs.input_ids.to(self.device)).last_hidden_state

        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=self.device)

        if do_classifier_free_guidance:
            uncond_tokens = "" if negative_prompt is None else negative_prompt

            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )

            negative_prompt_embeds = self.text_encoder(uncond_input.input_ids.to(self.device)).last_hidden_state

            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

        return prompt_embeds

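    # EDICT mixing layer (denoising direction): blend x toward y, then y toward the new x,
    # weighted by mixing_coeff. Keeps the coupled latents close while remaining exactly invertible.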
    def denoise_mixing_layer(self, x: torch.Tensor, y: torch.Tensor):
        x = self.mixing_coeff * x + (1 - self.mixing_coeff) * y
        y = self.mixing_coeff * y + (1 - self.mixing_coeff) * x

        return [x, y]

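    # Exact algebraic inverse of `denoise_mixing_layer`, used in the noising (inversion)
    # direction; the updates are undone in reverse order (y first, then x).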
    def noise_mixing_layer(self, x: torch.Tensor, y: torch.Tensor):
        y = (y - (1 - self.mixing_coeff) * x) / self.mixing_coeff
        x = (x - (1 - self.mixing_coeff) * y) / self.mixing_coeff

        return [x, y]

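    # Cumulative alpha/beta products for a timestep; timesteps below zero fall back to the
    # scheduler's `final_alpha_cumprod`.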
    def _get_alpha_and_beta(self, t: torch.Tensor):
        # t may be a 0-dim tensor or float; cast for indexing
        t = int(t)

        alpha_prod = self.scheduler.alphas_cumprod[t] if t >= 0 else self.scheduler.final_alpha_cumprod

        return alpha_prod, 1 - alpha_prod

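    # DDIM update solved for the noisier latent (inversion direction):
    # x_t = (x_{t-1} - b_t * eps) / a_t. The predicted noise comes from `model_input`, while
    # `base` is the latent actually being transformed; callers swap the two roles on return.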
    def noise_step(
        self,
        base: torch.Tensor,
        model_input: torch.Tensor,
        model_output: torch.Tensor,
        timestep: torch.Tensor,
    ):
        prev_timestep = timestep - self.scheduler.config.num_train_timesteps / self.scheduler.num_inference_steps

        alpha_prod_t, beta_prod_t = self._get_alpha_and_beta(timestep)
        alpha_prod_t_prev, beta_prod_t_prev = self._get_alpha_and_beta(prev_timestep)

        a_t = (alpha_prod_t_prev / alpha_prod_t) ** 0.5
        b_t = -a_t * (beta_prod_t**0.5) + beta_prod_t_prev**0.5

        next_model_input = (base - b_t * model_output) / a_t

        return model_input, next_model_input.to(base.dtype)

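    # Deterministic DDIM update (denoising direction): x_{t-1} = a_t * x_t + b_t * eps, with
    # a_t = sqrt(alpha_prod_prev / alpha_prod) and b_t = sqrt(beta_prod_prev) - a_t * sqrt(beta_prod).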
    def denoise_step(
        self,
        base: torch.Tensor,
        model_input: torch.Tensor,
        model_output: torch.Tensor,
        timestep: torch.Tensor,
    ):
        prev_timestep = timestep - self.scheduler.config.num_train_timesteps / self.scheduler.num_inference_steps

        alpha_prod_t, beta_prod_t = self._get_alpha_and_beta(timestep)
        alpha_prod_t_prev, beta_prod_t_prev = self._get_alpha_and_beta(prev_timestep)

        a_t = (alpha_prod_t_prev / alpha_prod_t) ** 0.5
        b_t = -a_t * (beta_prod_t**0.5) + beta_prod_t_prev**0.5
        next_model_input = a_t * base + b_t * model_output

        return model_input, next_model_input.to(base.dtype)

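    # Rescale by 1 / scaling_factor and decode latents to images with values in [0, 1].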
    @torch.no_grad()
    def decode_latents(self, latents: torch.Tensor):
        latents = 1 / self.vae.config.scaling_factor * latents
        image = self.vae.decode(latents).sample
        image = (image / 2 + 0.5).clamp(0, 1)
        return image

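    # EDICT inversion: encode the image into a pair of identical latents, then run the noising
    # loop over the reversed timestep schedule, conditioned on the base prompt, so the result
    # can later be denoised back exactly.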
    @torch.no_grad()
    def prepare_latents(
        self,
        image: torch.Tensor,
        text_embeds: torch.Tensor,
        timesteps: torch.Tensor,
        guidance_scale: float,
        generator: Optional[torch.Generator] = None,
    ):
        do_classifier_free_guidance = guidance_scale > 1.0

        image = image.to(device=self.device, dtype=text_embeds.dtype)
        latent = self.vae.encode(image).latent_dist.sample(generator)

        latent = self.vae.config.scaling_factor * latent

        coupled_latents = [latent.clone(), latent.clone()]

        for i, t in tqdm(enumerate(timesteps), total=len(timesteps)):
            coupled_latents = self.noise_mixing_layer(x=coupled_latents[0], y=coupled_latents[1])

            # j indexes the latent that drives the noise prediction, k the latent being updated
            for j in range(2):
                k = j ^ 1

                if self.leapfrog_steps:
                    if i % 2 == 0:
                        k, j = j, k

                model_input = coupled_latents[j]
                base = coupled_latents[k]

                latent_model_input = torch.cat([model_input] * 2) if do_classifier_free_guidance else model_input

                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeds).sample

                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                base, model_input = self.noise_step(
                    base=base,
                    model_input=model_input,
                    model_output=noise_pred,
                    timestep=t,
                )

                coupled_latents[k] = model_input

        return coupled_latents

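    # Full editing pass: deterministically invert `image` under `base_prompt`, then run the
    # EDICT denoising loop under `target_prompt` and decode the result.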
    @torch.no_grad()
    def __call__(
        self,
        base_prompt: str,
        target_prompt: str,
        image: Image.Image,
        guidance_scale: float = 3.0,
        num_inference_steps: int = 50,
        strength: float = 0.8,
        negative_prompt: Optional[str] = None,
        generator: Optional[torch.Generator] = None,
        output_type: Optional[str] = "pil",
    ):
        do_classifier_free_guidance = guidance_scale > 1.0

        image = self.image_processor.preprocess(image)

        base_embeds = self._encode_prompt(base_prompt, negative_prompt, do_classifier_free_guidance)
        target_embeds = self._encode_prompt(target_prompt, negative_prompt, do_classifier_free_guidance)

        self.scheduler.set_timesteps(num_inference_steps, device=self.device)

        t_limit = num_inference_steps - int(num_inference_steps * strength)
        fwd_timesteps = self.scheduler.timesteps[t_limit:]
        bwd_timesteps = fwd_timesteps.flip(0)

        coupled_latents = self.prepare_latents(image, base_embeds, bwd_timesteps, guidance_scale, generator)

        for i, t in tqdm(enumerate(fwd_timesteps), total=len(fwd_timesteps)):
            # j indexes the latent that drives the noise prediction, k the latent being updated
            for k in range(2):
                j = k ^ 1

                if self.leapfrog_steps:
                    if i % 2 == 1:
                        k, j = j, k

                model_input = coupled_latents[j]
                base = coupled_latents[k]

                latent_model_input = torch.cat([model_input] * 2) if do_classifier_free_guidance else model_input

                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=target_embeds).sample

                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                base, model_input = self.denoise_step(
                    base=base,
                    model_input=model_input,
                    model_output=noise_pred,
                    timestep=t,
                )

                coupled_latents[k] = model_input

            coupled_latents = self.denoise_mixing_layer(x=coupled_latents[0], y=coupled_latents[1])

        # after the final mixing layer the two latents agree; either one can be decoded
        final_latent = coupled_latents[0]

        if output_type not in ["latent", "pt", "np", "pil"]:
            deprecation_message = (
                f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: "
                "`pil`, `np`, `pt`, `latent`"
            )
            deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False)
            output_type = "np"

        if output_type == "latent":
            image = final_latent
        else:
            image = self.decode_latents(final_latent)
            image = self.image_processor.postprocess(image, output_type=output_type)

        return image
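

if __name__ == "__main__":
    # Minimal usage sketch, not part of the pipeline itself. The checkpoint id, image path, and
    # prompts below are illustrative assumptions, not requirements of this file. EDICT expects a
    # DDIM scheduler; `set_alpha_to_one=False` keeps `final_alpha_cumprod` consistent with the
    # fallback in `_get_alpha_and_beta` (this mirrors common EDICT setups, but is an assumption).
    model_id = "CompVis/stable-diffusion-v1-4"  # any SD v1-style checkpoint layout should work

    pipe = EDICTPipeline(
        vae=AutoencoderKL.from_pretrained(model_id, subfolder="vae"),
        text_encoder=CLIPTextModel.from_pretrained(model_id, subfolder="text_encoder"),
        tokenizer=CLIPTokenizer.from_pretrained(model_id, subfolder="tokenizer"),
        unet=UNet2DConditionModel.from_pretrained(model_id, subfolder="unet"),
        scheduler=DDIMScheduler.from_pretrained(model_id, subfolder="scheduler", set_alpha_to_one=False),
    ).to("cuda")

    input_image = Image.open("dog.png").convert("RGB")  # hypothetical input image

    # base_prompt should describe the input image; target_prompt describes the desired edit.
    edited = pipe(
        base_prompt="a photo of a dog",
        target_prompt="a photo of a cat",
        image=input_image,
    )[0]
    edited.save("cat.png")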