adding app.py

app.py ADDED
@@ -0,0 +1,240 @@
import os

import gradio as gr
import torch
import torch.nn.functional as F
from diffusers import AutoencoderKL, LMSDiscreteScheduler, UNet2DConditionModel
from PIL import Image
from torchvision import transforms as tfms
from tqdm.auto import tqdm
from transformers import CLIPTextModel, CLIPTokenizer, logging

torch.manual_seed(1)

# Suppress some unnecessary warnings when loading the CLIPTextModel
logging.set_verbosity_error()

# Set device
torch_device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
if torch_device == "mps":
    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

# Load the autoencoder model which will be used to decode the latents into image space.
vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae")

# Load the tokenizer and text encoder to tokenize and encode the text.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")

# The UNet model for generating the latents.
unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet")

# The noise scheduler
scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)

# To the GPU we go!
vae = vae.to(torch_device)
text_encoder = text_encoder.to(torch_device)
unet = unet.to(torch_device)

def pil_to_latent(input_im):
    # Single image -> single latent in a batch (so size 1, 4, 64, 64)
    with torch.no_grad():
        latent = vae.encode(tfms.ToTensor()(input_im).unsqueeze(0).to(torch_device) * 2 - 1)  # Note scaling to (-1, 1)
    return 0.18215 * latent.latent_dist.sample()

def latents_to_pil(latents):
    # Batch of latents -> list of images
    latents = (1 / 0.18215) * latents
    with torch.no_grad():
        image = vae.decode(latents).sample
    image = (image / 2 + 0.5).clamp(0, 1)
    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
    images = (image * 255).round().astype("uint8")
    pil_images = [Image.fromarray(image) for image in images]
    return pil_images
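
# 0.18215 is the latent scaling factor Stable Diffusion v1 was trained with,
# so latents are multiplied by it after encoding and divided by it before
# decoding. A quick round-trip sanity check (hypothetical file name, just a
# sketch):
#
#   im = Image.open("input.png").convert("RGB").resize((512, 512))
#   recon = latents_to_pil(pil_to_latent(im))[0]  # should closely match `im`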

# Prep Scheduler
def set_timesteps(scheduler, num_inference_steps):
    scheduler.set_timesteps(num_inference_steps)
    scheduler.timesteps = scheduler.timesteps.to(torch.float32)  # minor fix to ensure MPS compatibility, fixed in diffusers PR 3925

def blue_loss(images):
    # How far the blue channel values are from 0.9:
    error = torch.abs(images[:, 2] - 0.9).mean()  # [:, 2] -> all images in batch, only the blue channel
    return error

def diversity_loss(images):
    # Pairwise L2 distances between the flattened images in the batch
    flat = images.flatten(start_dim=1)
    diffs = flat.unsqueeze(1) - flat.unsqueeze(0)  # (B, B, D)
    pairwise_distances = torch.sqrt((diffs ** 2).sum(dim=2) + 1e-8)  # epsilon keeps the sqrt differentiable
    # Negate so that *minimizing* the loss pushes the images apart (more diversity)
    return -pairwise_distances.mean()

def red_loss(images):
    # How far the red channel values are from a target value (e.g., 0.7):
    error = torch.abs(images[:, 0] - 0.7).mean()  # [:, 0] -> all images in batch, only the red channel
    return error

def green_loss(images):
    # How far the green channel values are from a target value (e.g., 0.8):
    error = torch.abs(images[:, 1] - 0.8).mean()  # [:, 1] -> all images in batch, only the green channel
    return error

def saturation_loss(images, target_saturation=0.5):
    # Per-pixel saturation: spread between the strongest and weakest colour channel (dim=1)
    saturation = images.max(dim=1)[0] - images.min(dim=1)[0]
    # Mean absolute difference from the target saturation
    loss = torch.abs(saturation - target_saturation).mean()
    return loss

def brightness_loss(images, target_brightness=0.6):
    # Brightness of each image channel: average pixel intensity over height and width
    brightness = images.mean(dim=(2, 3))
    # Mean squared error from the target brightness
    loss = (brightness - target_brightness).pow(2).mean()
    return loss

def edge_detection_loss(images):
    # The Sobel filters below expect a single channel, so average RGB to grayscale first
    gray = images.mean(dim=1, keepdim=True)
    sobel_x = torch.tensor([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], dtype=images.dtype, device=images.device).view(1, 1, 3, 3)
    sobel_y = torch.tensor([[-1, -2, -1], [0, 0, 0], [1, 2, 1]], dtype=images.dtype, device=images.device).view(1, 1, 3, 3)
    gradient_x = F.conv2d(gray, sobel_x, padding=1)
    gradient_y = F.conv2d(gray, sobel_y, padding=1)
    # Magnitude of the gradients (small epsilon keeps the sqrt differentiable at 0)
    gradient_magnitude = torch.sqrt(gradient_x**2 + gradient_y**2 + 1e-8)
    # Encourage a specific level of edge presence
    loss = gradient_magnitude.mean()
    return loss

def noise_regularization_loss(images, noise_std=0.1):
    # Mean squared error of the image against a noisy version of itself
    noisy_images = images + noise_std * torch.randn_like(images)
    loss = torch.mean((images - noisy_images).pow(2))
    return loss
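
# Lookup from the radio-button labels used in the interface below to the loss
# functions above, so the loss the user selects is the one actually applied
# during guidance. (`LOSS_FUNCTIONS` is an added helper name.)
LOSS_FUNCTIONS = {
    "Diversity Loss": diversity_loss,
    "Saturation Loss": saturation_loss,
    "Brightness Loss": brightness_loss,
    "Edge Detection Loss": edge_detection_loss,
    "Noise Regularization Loss": noise_regularization_loss,
    "Blue Loss": blue_loss,
    "Red Loss": red_loss,
    "Green Loss": green_loss,
}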

def image_generation(prompt, loss_fxn):
    generated_image = []
    latents_values = []  # final latents kept for inspection; not shown in the UI
    seed_list = [8, 16, 32, 64, 128]
    for seed in seed_list:
        height = 512  # default height of Stable Diffusion
        width = 512   # default width of Stable Diffusion
        num_inference_steps = 50
        guidance_scale = 8  # classifier-free guidance strength
        batch_size = 1
        loss_scale = 200  # weight of the loss-based guidance
        generator = torch.manual_seed(seed)

        # Prep text
        text_input = tokenizer([prompt], padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
        with torch.no_grad():
            text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]

        # And the unconditional (empty-prompt) input for classifier-free guidance:
        max_length = text_input.input_ids.shape[-1]
        uncond_input = tokenizer(
            [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
        )
        with torch.no_grad():
            uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]
        text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
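
        # Shapes at this point (CLIP ViT-L/14): each input_ids tensor is
        # (1, 77), and text_embeddings is (2, 77, 768) after concatenating the
        # unconditional and conditional embeddings.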

        # Prep Scheduler
        set_timesteps(scheduler, num_inference_steps)

        # Prep latents (the VAE downsamples by a factor of 8, so 512x512 pixels -> 64x64 latents)
        latents = torch.randn(
            (batch_size, unet.config.in_channels, height // 8, width // 8),
            generator=generator,
        )
        latents = latents.to(torch_device)
        latents = latents * scheduler.init_noise_sigma

        # Sampling loop
        for i, t in tqdm(enumerate(scheduler.timesteps), total=len(scheduler.timesteps)):
            # Expand the latents so classifier-free guidance needs only one forward pass.
            latent_model_input = torch.cat([latents] * 2)
            sigma = scheduler.sigmas[i]
            latent_model_input = scheduler.scale_model_input(latent_model_input, t)

            # Predict the noise residual
            with torch.no_grad():
                noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings)["sample"]

            # Perform CFG
            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
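
            # The CFG update above is: noise_uncond + scale * (noise_text - noise_uncond),
            # i.e. the unconditional prediction plus an amplified push in the direction
            # the prompt suggests; guidance_scale = 8 favours prompt adherence over diversity.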

            #### ADDITIONAL GUIDANCE ####
            if i % 5 == 0:
                # Require grad on the latents
                latents = latents.detach().requires_grad_()

                # Get the predicted x0 (here x_t = x_0 + sigma * noise, so the
                # denoised estimate is x_t minus sigma times the predicted noise):
                latents_x0 = latents - sigma * noise_pred
                # latents_x0 = scheduler.step(noise_pred, t, latents).pred_original_sample

                # Decode to image space
                denoised_images = vae.decode((1 / 0.18215) * latents_x0).sample / 2 + 0.5  # range (0, 1)

                # Calculate the selected loss (falling back to blue_loss if nothing is chosen)
                loss_fn = LOSS_FUNCTIONS.get(loss_fxn, blue_loss)
                loss = loss_fn(denoised_images) * loss_scale

                # Occasionally print it out
                # if i % 10 == 0:
                #     print(i, 'loss:', loss.item())

                # Get the gradient of the loss w.r.t. the latents
                cond_grad = torch.autograd.grad(loss, latents)[0]

                # Modify the latents based on this gradient: a gradient-descent step
                # scaled by sigma**2, so the nudge shrinks as the noise level drops
                latents = latents.detach() - cond_grad * sigma**2

            # Now step with scheduler
            latents = scheduler.step(noise_pred, t, latents).prev_sample
        generated_image.append(latents_to_pil(latents)[0])
        latents_values.append(latents)

    # One final image per seed; the Gradio Gallery below displays the whole list
    return generated_image
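
# Hypothetical direct call (bypassing the UI), just to show the contract:
#   images = image_generation("A watercolor painting of a fox", "Blue Loss")
#   images[0].save("fox_seed8.png")  # one PIL image per seed in seed_list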

# Create a Gradio interface
iface = gr.Interface(
    fn=image_generation,
    inputs=[
        # gr.CheckboxGroup(
        #     label="Seed List", choices=[8, 32, 64, 128, 256], type="number"
        # ),
        gr.Textbox(label="Prompt Input"),
        gr.Radio(
            label="Loss Function",
            choices=[
                "Diversity Loss",
                "Saturation Loss",
                "Brightness Loss",
                "Edge Detection Loss",
                "Noise Regularization Loss",
                "Blue Loss",
                "Red Loss",
                "Green Loss",
            ],
        ),
    ],
    outputs=gr.Gallery(label="Generated Images"),
    title="Stable Diffusion Image Generation Guided by a Loss Function",
    description="Enter a prompt and choose a loss function to guide Stable Diffusion; one image is generated per seed.",
)

# Launch the Gradio interface
iface.launch()
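
# A minimal requirements.txt sketch for this Space, inferred from the imports
# above (unpinned versions are an assumption; pin as needed):
#
#   gradio
#   torch
#   torchvision
#   diffusers
#   transformers
#   tqdm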