Commit a67c790
Parent(s): 47017f5
Adding guidance
app.py CHANGED
@@ -3,6 +3,7 @@ import torch
 import gradio as gr
 from tqdm import tqdm
 from PIL import Image
+import torch.nn.functional as F
 from torchvision import transforms as tfms
 from transformers import CLIPTextModel, CLIPTokenizer, logging
 from diffusers import AutoencoderKL, LMSDiscreteScheduler, UNet2DConditionModel
@@ -53,6 +54,8 @@ token_emb_layer_with_art.load_state_dict({'weight': torch.cat((token_emb_layer.s
     tony_diterlizzi_s_planescape_art_embed['<tony-diterlizzi-planescape>'].unsqueeze(0).to(torch_device)))})
 token_emb_layer_with_art = token_emb_layer_with_art.to(torch_device)
 
+grayscale_transformer = tfms.Grayscale(num_output_channels=3)
+
 def set_timesteps(scheduler, num_inference_steps):
     scheduler.set_timesteps(num_inference_steps)
     scheduler.timesteps = scheduler.timesteps.to(torch.float32)
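
Note on the new module-level transform: tfms.Grayscale(num_output_channels=3) keeps the (B, 3, H, W) shape by replicating the luma value across all three channels, which is what lets the grayscale loss below subtract it from the RGB input elementwise. A minimal standalone sketch of that behavior (dummy tensor, not part of the commit):

    import torch
    from torchvision import transforms as tfms

    gray3 = tfms.Grayscale(num_output_channels=3)
    imgs = torch.rand(2, 3, 64, 64)                      # dummy RGB batch in (0, 1)
    colorfulness = torch.abs(gray3(imgs) - imgs).mean()  # ~0 only if the batch is already gray
    print(colorfulness.item())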
@@ -148,7 +151,106 @@ def generate_with_embs(num_inference_steps, guidance_scale, seed, text_input, te
 
     return latents_to_pil(latents)[0]
 
-def inference(text, style, inference_step, guidance_scale, seed):
+def guide_loss(images, loss_type='grayscale'):
+    # grayscale loss
+    if loss_type == 'grayscale':
+        transformed_imgs = grayscale_transformer(images)
+        error = torch.abs(transformed_imgs - images).mean()
+
+    # brightness loss
+    elif loss_type == 'bright':
+        transformed_imgs = tfms.functional.adjust_brightness(images, brightness_factor=3)
+        error = torch.abs(transformed_imgs - images).mean()
+
+    # contrast loss
+    elif loss_type == 'contrast':
+        transformed_imgs = tfms.functional.adjust_contrast(images, contrast_factor=10)
+        error = torch.abs(transformed_imgs - images).mean()
+
+    # symmetry loss - flip the image along the width
+    elif loss_type == 'symmetry':
+        flipped_image = torch.flip(images, [3])
+        error = F.mse_loss(images, flipped_image)
+
+    # saturation loss
+    elif loss_type == 'saturation':
+        transformed_imgs = tfms.functional.adjust_saturation(images, saturation_factor=10)
+        error = torch.abs(transformed_imgs - images).mean()
+
+    return error
+
+def generate_with_guide_loss(num_inference_steps, guidance_scale, seed, text_input, text_embeddings, loss_type, loss_scale):
+    height = 512                         # default height of Stable Diffusion
+    width = 512                          # default width of Stable Diffusion
+    generator = torch.manual_seed(seed)  # Seed generator to create the initial latent noise
+    batch_size = 1
+
+    # And the uncond. input as before:
+    max_length = text_input.input_ids.shape[-1]
+    uncond_input = tokenizer(
+        [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
+    )
+    with torch.no_grad():
+        uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]
+    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+    # Prep Scheduler
+    set_timesteps(scheduler, num_inference_steps)
+
+    # Prep latents
+    latents = torch.randn(
+        (batch_size, unet.in_channels, height // 8, width // 8),
+        generator=generator,
+    )
+    latents = latents.to(torch_device)
+    latents = latents * scheduler.init_noise_sigma
+
+    # Loop
+    for i, t in tqdm(enumerate(scheduler.timesteps), total=len(scheduler.timesteps)):
+        # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
+        latent_model_input = torch.cat([latents] * 2)
+        sigma = scheduler.sigmas[i]
+        latent_model_input = scheduler.scale_model_input(latent_model_input, t)
+
+        # predict the noise residual
+        with torch.no_grad():
+            noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings)["sample"]
+
+        # perform CFG
+        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+        ### ADDITIONAL GUIDANCE ###
+        if i % 5 == 0:
+            # Requires grad on the latents
+            latents = latents.detach().requires_grad_()
+
+            # Get the predicted x0:
+            latents_x0 = latents - sigma * noise_pred
+            # latents_x0 = scheduler.step(noise_pred, t, latents).pred_original_sample
+
+            # Decode to image space
+            denoised_images = vae.decode((1 / 0.18215) * latents_x0).sample / 2 + 0.5  # range (0, 1)
+
+            # Calculate loss
+            loss = guide_loss(denoised_images, loss_type) * loss_scale
+
+            # Occasionally print it out
+            if i % 10 == 0:
+                print(i, 'loss:', loss.item())
+
+            # Get gradient
+            cond_grad = torch.autograd.grad(loss, latents)[0]
+
+            # Modify the latents based on this gradient
+            latents = latents.detach() - cond_grad * sigma**2
+
+        # Now step with scheduler
+        latents = scheduler.step(noise_pred, t, latents).prev_sample
+
+    return latents_to_pil(latents)[0]
+
+def inference(text, style, inference_step, guidance_scale, seed, guidance_method, loss_scale):
     prompt = text + " the style of " + style_token_dict[style]
 
     # Tokenize
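
The new generate_with_guide_loss follows the usual loss-guided sampling recipe: every fifth step it decodes the predicted x0, scores the decoded image with guide_loss, and nudges the latents against the gradient (latents -= cond_grad * sigma**2) before the scheduler step. A minimal sketch of guide_loss in isolation (dummy tensors; assumes guide_loss and grayscale_transformer from the diff above are in scope):

    import torch

    # A gray image has identical channels, so the grayscale loss is ~0
    gray = torch.rand(1, 1, 64, 64).repeat(1, 3, 1, 1)
    print(guide_loss(gray, 'grayscale').item())   # ~0.0

    # A colorful image scores higher; its gradient pushes latents toward gray
    color = torch.rand(1, 3, 64, 64)
    print(guide_loss(color, 'grayscale').item())  # > 0

    # 'symmetry' compares the image with its left-right mirror
    print(guide_loss(color, 'symmetry').item())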
@@ -165,26 +267,34 @@ def inference(text, style, inference_step, guidance_scale, seed):
     modified_output_embeddings = get_output_embeds(input_embeddings)
 
     # And generate an image with this:
-    image = generate_with_embs(inference_step, guidance_scale, seed, text_input, modified_output_embeddings)
+    image_embs = generate_with_embs(inference_step, guidance_scale, seed, text_input, modified_output_embeddings)
+
+    # Generate an image with guidance
+    image_guide = generate_with_guide_loss(inference_step, guidance_scale, seed, text_input,
+                                           modified_output_embeddings, guidance_method.lower(), loss_scale)
 
-    return image
+    return image_embs, image_guide
 
 title = "Stable Diffusion with Textual Inversion"
 description = "A simple Gradio interface to infer Stable Diffusion and generate images in different art styles"
-examples = [["A sweet potato farm", 'Concept', 10, 1.5, 1],
-            ["Sky full of cotton candy", 'Realistic', 10, 3.5, 2],
-            ["Kittens in the bathtub", 'Line', 10, 5.5, 3],
-            ["Water skiing on a lake", 'Ricky', 10, 7.5, 4],
-            ["Miniature pet elephant", 'Plane Scape', 10, 9.5, 5]]
+examples = [["A sweet potato farm", 'Concept', 10, 1.5, 1, 'Grayscale', 100],
+            ["Sky full of cotton candy", 'Realistic', 10, 3.5, 2, 'Bright', 200],
+            ["Kittens in the bathtub", 'Line', 10, 5.5, 3, 'Contrast', 300],
+            ["Water skiing on a lake", 'Ricky', 10, 7.5, 4, 'Symmetry', 400],
+            ["Miniature pet elephant", 'Plane Scape', 10, 9.5, 5, 'Saturation', 500]]
 
 demo = gr.Interface(inference,
                     inputs = [gr.Textbox(label="Prompt", type="text"),
                               gr.Dropdown(label="Style", choices=['Concept', 'Realistic', 'Line',
                                                                   'Ricky', 'Plane Scape'], value="Concept"),
-                              gr.Slider(10, 30, 10, step =
+                              gr.Slider(10, 30, 10, step = 5, label="Inference steps"),
                               gr.Slider(1, 10, 7.5, step = 0.1, label="Guidance scale"),
-                              gr.Slider(0, 10000, 1, step = 1, label="Seed")],
-                    outputs= gr.Image(width=320, height=320, label="Generated art"),
+                              gr.Slider(0, 10000, 1, step = 1, label="Seed"),
+                              gr.Dropdown(label="Guidance method", choices=['Grayscale', 'Bright', 'Contrast',
+                                                                            'Symmetry', 'Saturation'], value="Grayscale"),
+                              gr.Slider(100, 10000, 100, step = 1, label="Loss scale")],
+                    outputs= [gr.Image(width=320, height=320, label="Generated art"),
+                              gr.Image(width=320, height=320, label="Generated art with guidance")],
                     title=title,
                     description=description,
                     examples=examples)
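
The diff does not show how the interface is started; a Hugging Face Space typically ends app.py with a launch call such as the following (assumed, not part of this commit):

    if __name__ == "__main__":
        demo.launch()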
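
With the new signature, inference can also be exercised directly once the models above are loaded; a hypothetical call mirroring the first Gradio example would return both the plain textual-inversion image and the loss-guided one:

    image_embs, image_guide = inference("A sweet potato farm", "Concept", 10, 1.5, 1, "Grayscale", 100)
    image_guide.save("guided_art.png")  # both return values are PIL images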