Spaces:

tulsi0897
/

ERA20_stable_diffusion

Build error

App Files Files Community

ERA20_stable_diffusion / app.py

tulsi0897

adding app.py

ef6397b over 2 years ago

raw

history blame contribute delete

9.73 kB

	import gradio as gr
	from PIL import Image
	import IPython.display as display
	import matplotlib.pyplot as plt
	from base64 import b64encode
	import numpy
	import torch
	import torch.nn.functional as F
	from diffusers import AutoencoderKL, LMSDiscreteScheduler, UNet2DConditionModel
	from huggingface_hub import notebook_login

	# For video display:
	from IPython.display import HTML
	from matplotlib import pyplot as plt
	from pathlib import Path
	from PIL import Image
	from torch import autocast
	from torchvision import transforms as tfms
	from tqdm.auto import tqdm
	from transformers import CLIPTextModel, CLIPTokenizer, logging
	import os

	# Seed the global RNG so that repeated runs start from the same state.
	torch.manual_seed(1)

	# Suppress some unnecessary warnings when loading the CLIPTextModel.
	logging.set_verbosity_error()

	# Choose the compute device: CUDA first, then Apple MPS, otherwise CPU.
	if torch.cuda.is_available():
	    torch_device = "cuda"
	elif torch.backends.mps.is_available():
	    torch_device = "mps"
	else:
	    torch_device = "cpu"

	# Let PyTorch fall back to other kernels for ops MPS does not implement.
	if torch_device == "mps":
	    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

	# Autoencoder used to move between image space and latent space.
	vae = AutoencoderKL.from_pretrained(
	    "CompVis/stable-diffusion-v1-4", subfolder="vae"
	).to(torch_device)

	# Tokenizer and text encoder used to turn the prompt into embeddings.
	tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
	text_encoder = CLIPTextModel.from_pretrained(
	    "openai/clip-vit-large-patch14"
	).to(torch_device)

	# UNet that predicts the noise residual on the latents.
	unet = UNet2DConditionModel.from_pretrained(
	    "CompVis/stable-diffusion-v1-4", subfolder="unet"
	).to(torch_device)

	# The noise scheduler.
	scheduler = LMSDiscreteScheduler(
	    beta_start=0.00085,
	    beta_end=0.012,
	    beta_schedule="scaled_linear",
	    num_train_timesteps=1000,
	)

	def pil_to_latent(input_im):
	    """Encode one image into a batch containing a single scaled VAE latent.

	    The image is converted to a tensor, given a leading batch axis, moved to
	    ``torch_device`` and rescaled with ``x * 2 - 1`` before being handed to
	    ``vae.encode``. A sample drawn from the returned latent distribution is
	    multiplied by 0.18215 and returned.
	    """
	    pixels = tfms.ToTensor()(input_im)
	    batch = pixels.unsqueeze(0).to(torch_device)
	    with torch.no_grad():
	        encoded = vae.encode(batch * 2 - 1)  # note the input rescaling
	    return 0.18215 * encoded.latent_dist.sample()

	def latents_to_pil(latents):
	    """Decode a batch of latents into a list of PIL images.

	    The latents are divided by the 0.18215 scaling factor, decoded with
	    ``vae.decode``, mapped through ``x / 2 + 0.5`` and clamped to [0, 1],
	    then converted to channel-last ``uint8`` arrays, one image per batch
	    entry.
	    """
	    latents = (1 / 0.18215) * latents
	    with torch.no_grad():
	        decoded = vae.decode(latents).sample
	    unit_range = (decoded / 2 + 0.5).clamp(0, 1)
	    # Move the channel axis last, which is the layout Image.fromarray is given.
	    channel_last = unit_range.detach().cpu().permute(0, 2, 3, 1).numpy()
	    as_bytes = (channel_last * 255).round().astype("uint8")
	    return list(map(Image.fromarray, as_bytes))

	# Scheduler preparation
	def set_timesteps(scheduler, num_inference_steps):
	    """Configure ``scheduler`` for a run and store its timesteps as float32.

	    Calls ``scheduler.set_timesteps(num_inference_steps)`` and then replaces
	    ``scheduler.timesteps`` with a float32 copy. The cast is a small
	    workaround for MPS compatibility (fixed upstream in diffusers PR 3925).
	    """
	    scheduler.set_timesteps(num_inference_steps)
	    float32_timesteps = scheduler.timesteps.to(torch.float32)
	    scheduler.timesteps = float32_timesteps

	def blue_loss(images, target=0.9):
	    """Return the mean absolute deviation of channel 2 from ``target``.

	    Args:
	        images: Tensor indexed as ``images[:, 2]``; presumably laid out as
	            (batch, channels, height, width) with RGB channel order, so
	            index 2 is blue -- confirm against the caller.
	        target: Value the selected channel is pulled towards. Defaults to
	            0.9, the value that used to be hard-coded.

	    Returns:
	        A scalar tensor holding the mean of ``|images[:, 2] - target|``.
	    """
	    blue_channel = images[:, 2]  # every image in the batch, channel 2 only
	    return (blue_channel - target).abs().mean()

	def diversity_loss(images):
	    """Return the negated mean pairwise L2 distance between batch images.

	    Each image is flattened to a vector, so the distance between two images
	    is taken over all of their elements rather than along a single axis.
	    The mean is negated so that *minimising* the returned value pushes the
	    images apart, which is what a diversity objective requires.

	    Args:
	        images: Tensor whose first axis indexes the images of a batch.

	    Returns:
	        A scalar tensor. The mean runs over every ordered pair, including
	        each image paired with itself, so a batch of one yields zero.
	    """
	    flattened = images.reshape(images.shape[0], -1)
	    # Broadcasting (B, 1, D) against (1, B, D) yields all pairwise differences.
	    differences = flattened.unsqueeze(1) - flattened.unsqueeze(0)
	    pairwise_distances = torch.norm(differences, p=2, dim=-1)
	    return -pairwise_distances.mean()

	def red_loss(images, target=0.7):
	    """Return the mean absolute deviation of channel 0 from ``target``.

	    Args:
	        images: Tensor indexed as ``images[:, 0]``; presumably laid out as
	            (batch, channels, height, width) with RGB channel order, so
	            index 0 is red -- confirm against the caller.
	        target: Value the selected channel is pulled towards. Defaults to
	            0.7, the value that used to be hard-coded.

	    Returns:
	        A scalar tensor holding the mean of ``|images[:, 0] - target|``.
	    """
	    red_channel = images[:, 0]  # every image in the batch, channel 0 only
	    return (red_channel - target).abs().mean()

	def green_loss(images, target=0.8):
	    """Return the mean absolute deviation of channel 1 from ``target``.

	    Args:
	        images: Tensor indexed as ``images[:, 1]``; presumably laid out as
	            (batch, channels, height, width) with RGB channel order, so
	            index 1 is green -- confirm against the caller.
	        target: Value the selected channel is pulled towards. Defaults to
	            0.8, the value that used to be hard-coded.

	    Returns:
	        A scalar tensor holding the mean of ``|images[:, 1] - target|``.
	    """
	    green_channel = images[:, 1]  # every image in the batch, channel 1 only
	    return (green_channel - target).abs().mean()

	def saturation_loss(images, target_saturation=0.5):
	    """Return the mean absolute gap between pixel saturation and a target.

	    Saturation is measured per pixel as the spread between its largest and
	    smallest channel value, so the reduction runs over the channel axis
	    (dim 1, the axis the colour losses in this file index) rather than over
	    a spatial axis.

	    Args:
	        images: Tensor with the colour channels on dim 1; presumably
	            (batch, channels, height, width) -- confirm against the caller.
	        target_saturation: Desired max-minus-min channel spread.

	    Returns:
	        A scalar tensor with the mean of ``|saturation - target_saturation|``.
	    """
	    strongest = images.max(dim=1).values
	    weakest = images.min(dim=1).values
	    saturation = strongest - weakest
	    return (saturation - target_saturation).abs().mean()

	def brightness_loss(images, target_brightness=0.6):
	    """Return the mean squared gap between mean intensity and a target.

	    The input is averaged over dims 2 and 3, ``target_brightness`` is
	    subtracted from each resulting mean, and the squared differences are
	    averaged into a single scalar tensor.
	    """
	    mean_intensity = images.mean(dim=(2, 3))
	    deviation = mean_intensity - target_brightness
	    return torch.mean(deviation ** 2)

	def edge_detection_loss(images, eps=1e-8):
	    """Return the mean Sobel gradient magnitude of ``images``.

	    Horizontal and vertical 3x3 Sobel kernels are applied to every channel
	    independently (zero padding of 1 keeps the spatial size), the two
	    responses are combined into a gradient magnitude, and the magnitudes
	    are averaged.

	    Args:
	        images: 4-D tensor with the channels on dim 1, as expected by
	            ``F.conv2d``.
	        eps: Small constant added under the square root so the gradient of
	            the loss stays finite where the image is perfectly flat.

	    Returns:
	        A scalar tensor with the mean gradient magnitude.
	    """
	    channels = images.shape[1]
	    # Build the kernels with the input's dtype and device so the convolution
	    # also works when the images live on an accelerator.
	    sobel_x = torch.tensor(
	        [[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]],
	        dtype=images.dtype,
	        device=images.device,
	    )
	    sobel_y = torch.tensor(
	        [[-1, -2, -1], [0, 0, 0], [1, 2, 1]],
	        dtype=images.dtype,
	        device=images.device,
	    )
	    # One kernel copy per channel; groups=channels filters each channel alone.
	    weight_x = sobel_x.view(1, 1, 3, 3).repeat(channels, 1, 1, 1)
	    weight_y = sobel_y.view(1, 1, 3, 3).repeat(channels, 1, 1, 1)
	    gradient_x = F.conv2d(images, weight_x, padding=1, groups=channels)
	    gradient_y = F.conv2d(images, weight_y, padding=1, groups=channels)
	    gradient_magnitude = torch.sqrt(gradient_x ** 2 + gradient_y ** 2 + eps)
	    return gradient_magnitude.mean()

	def noise_regularization_loss(images, noise_std=0.1):
	    """Return the mean squared difference between images and a noisy copy.

	    A noisy copy is built by adding Gaussian noise scaled by ``noise_std``
	    to ``images``; the squared difference between the input and that copy
	    is averaged into a scalar tensor.

	    NOTE(review): the difference is just the injected noise (up to rounding),
	    so the value looks independent of the image content -- confirm that
	    this is the intended objective.
	    """
	    perturbation = noise_std * torch.randn_like(images)
	    noisy_copy = images + perturbation
	    residual = images - noisy_copy
	    return (residual ** 2).mean()

	def _resolve_loss_function(loss_fxn):
	    """Map a UI label (or a callable, or None) to a guidance loss function."""
	    if loss_fxn is None:
	        # No selection made: keep the loss that was previously always used.
	        return blue_loss
	    if callable(loss_fxn):
	        return loss_fxn
	    available = {
	        "Diversity Loss": diversity_loss,
	        "Saturation Loss": saturation_loss,
	        "Brightness Loss": brightness_loss,
	        "Edge Detection Loss": edge_detection_loss,
	        "Noise Regularization Loss": noise_regularization_loss,
	        "Blue Loss": blue_loss,
	        "Red Loss": red_loss,
	        "Green Loss": green_loss,
	    }
	    if loss_fxn not in available:
	        raise ValueError(
	            f"Unknown loss function {loss_fxn!r}; expected one of {sorted(available)}"
	        )
	    return available[loss_fxn]


	def image_generation(prompt, loss_fxn):
	    """Generate one loss-guided image per seed for ``prompt``.

	    For each seed in a fixed list, latents are sampled, denoised for 50
	    scheduler steps with classifier-free guidance, and on every fifth step
	    nudged along the negative gradient of the selected loss evaluated on
	    the decoded prediction of the final image.

	    Args:
	        prompt: Text prompt to condition the generation on.
	        loss_fxn: Label of the guidance loss (one of the choices offered by
	            the UI), a callable taking a batch of decoded images, or
	            ``None`` to use ``blue_loss``.

	    Returns:
	        A tuple ``(generated_image, latents_values)``: the list of decoded
	        PIL images and the list of final latents, one entry per seed.

	    Raises:
	        ValueError: If ``loss_fxn`` is a label that matches no known loss.
	    """
	    loss_function = _resolve_loss_function(loss_fxn)

	    seed_list = [8, 16, 32, 64, 128]
	    height = 512  # default height of Stable Diffusion
	    width = 512  # default width of Stable Diffusion
	    num_inference_steps = 50
	    guidance_scale = 8
	    batch_size = 1
	    loss_scale = 200  # weight applied to the guidance loss

	    # Prep text. The embeddings do not depend on the seed, so they are
	    # computed once instead of once per seed.
	    text_input = tokenizer(
	        [prompt],
	        padding="max_length",
	        max_length=tokenizer.model_max_length,
	        truncation=True,
	        return_tensors="pt",
	    )
	    max_length = text_input.input_ids.shape[-1]
	    uncond_input = tokenizer(
	        [""] * batch_size,
	        padding="max_length",
	        max_length=max_length,
	        return_tensors="pt",
	    )
	    with torch.no_grad():
	        cond_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]
	        uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]
	    # Unconditional first, conditional second: the order chunk(2) relies on.
	    text_embeddings = torch.cat([uncond_embeddings, cond_embeddings])

	    generated_image = []
	    latents_values = []  # accumulated across seeds, not reset per seed
	    for seed in seed_list:
	        generator = torch.manual_seed(seed)

	        # Prep scheduler (called per seed so each run starts from a fresh schedule).
	        set_timesteps(scheduler, num_inference_steps)

	        # Prep latents
	        latents = torch.randn(
	            (batch_size, unet.config.in_channels, height // 8, width // 8),
	            generator=generator,
	        )
	        latents = latents.to(torch_device)
	        latents = latents * scheduler.init_noise_sigma

	        # Loop
	        for i, t in tqdm(enumerate(scheduler.timesteps), total=len(scheduler.timesteps)):
	            # Duplicate the latents so the unconditional and conditional
	            # predictions come out of a single forward pass.
	            latent_model_input = torch.cat([latents] * 2)
	            sigma = scheduler.sigmas[i]
	            latent_model_input = scheduler.scale_model_input(latent_model_input, t)

	            # Predict the noise residual
	            with torch.no_grad():
	                noise_pred = unet(
	                    latent_model_input, t, encoder_hidden_states=text_embeddings
	                )["sample"]

	            # Perform classifier-free guidance
	            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
	            noise_pred = noise_pred_uncond + guidance_scale * (
	                noise_pred_text - noise_pred_uncond
	            )

	            # Additional guidance from the selected loss, every fifth step
	            if i % 5 == 0:
	                # Requires grad on the latents
	                latents = latents.detach().requires_grad_()

	                # Get the predicted x0
	                latents_x0 = latents - sigma * noise_pred

	                # Decode to image space, range (0, 1)
	                denoised_images = vae.decode((1 / 0.18215) * latents_x0).sample / 2 + 0.5

	                # Calculate the loss and its gradient w.r.t. the latents
	                loss = loss_function(denoised_images) * loss_scale
	                cond_grad = torch.autograd.grad(loss, latents)[0]

	                # Modify the latents based on this gradient
	                latents = latents.detach() - cond_grad * sigma**2

	            # Now step with the scheduler
	            latents = scheduler.step(noise_pred, t, latents).prev_sample

	        generated_image.append(latents_to_pil(latents)[0])
	        latents_values.append(latents)

	    return generated_image, latents_values


	def _generate_images(prompt, loss_fxn):
	    """Adapter for the UI: run ``image_generation`` and return only the images.

	    ``image_generation`` returns an ``(images, latents)`` tuple, while the
	    interface has a single output component, so the latents are dropped.
	    """
	    images, _latents = image_generation(prompt, loss_fxn)
	    return images


	# Create a Gradio interface. Components are taken from the top-level ``gr``
	# namespace because the ``gr.inputs`` / ``gr.outputs`` namespaces are not
	# available in current Gradio releases. A gallery is used because one image
	# is produced per seed.
	iface = gr.Interface(
	    fn=_generate_images,
	    inputs=[
	        gr.Textbox(label="Prompt Input"),
	        gr.Radio(
	            label="Loss Function",
	            choices=[
	                "Diversity Loss",
	                "Saturation Loss",
	                "Brightness Loss",
	                "Edge Detection Loss",
	                "Noise Regularization Loss",
	                "Blue Loss",
	                "Red Loss",
	                "Green Loss",
	            ],
	            value="Blue Loss",
	        ),
	    ],
	    outputs=gr.Gallery(label="Generated Images"),
	    title="Stable Diffusion Guided by Loss Function Image Generation with Gradio",
	    description="Enter parameters to generate images using Stable Diffusion with optional loss functions.",
	)

	# Launch the Gradio interface
	iface.launch()