Spaces:

hohonu-vicml
/

DirectedDiffusion

Runtime error

App Files Files Community

eggarsway committed on Mar 15, 2023

Commit

b0369c2

1 Parent(s): 7ea5b16

add DD

Browse files

Files changed (7) hide show

DirectedDiffusion/AttnCore.py +138 -0
DirectedDiffusion/AttnEditorUtils.py +164 -0
DirectedDiffusion/Diffusion.py +149 -0
DirectedDiffusion/Plotter.py +29 -0
DirectedDiffusion/ProgramInfo.py +57 -0
DirectedDiffusion/__init__.py +11 -0
app.py +3 -1

DirectedDiffusion/AttnCore.py ADDED Viewed

	@@ -0,0 +1,138 @@

+"""
+ORIGINAL SOURCE CODE
+https://github.com/huggingface/diffusers/blob/91ddd2a25b848df0fa1262d4f1cd98c7ccb87750/src/diffusers/models/attention.py#L276
+"""
+import torch
+import math
+import copy
+import numpy as np
+def gaussian_2d(x=0, y=0, mx=0, my=0, sx=1, sy=1):
+    """Evaluate an axis-aligned 2D Gaussian density.
+
+    Args:
+        x (float or torch.Tensor): sample x coordinate(s).
+        y (float or torch.Tensor): sample y coordinate(s).
+        mx (float): mean along x.
+        my (float): mean along y.
+        sx (float): standard deviation along x.
+        sy (float): standard deviation along y.
+
+    Returns:
+        torch.Tensor: the density at the given sample(s), broadcast over
+        ``x`` and ``y``.
+    """
+    # torch.exp only accepts tensors, so plain numbers (including the
+    # defaults) are promoted first; tensors pass through unchanged.
+    x = torch.as_tensor(x)
+    y = torch.as_tensor(y)
+    exponent = -((x - mx) ** 2 / (2 * sx ** 2) + (y - my) ** 2 / (2 * sy ** 2))
+    return 1 / (2 * math.pi * sx * sy) * torch.exp(exponent)
+def init_attention_edit(
+    unet, tokens=[], rios=[], num_trailing_attn=[], length_prompt=0, noise_scale=[],
+):
+    """Patch every ``CrossAttention`` module of ``unet`` with an editable attention.
+
+    Each module whose class name is ``CrossAttention`` receives a set of
+    control flags (all initially off) and has its ``_attention`` attribute
+    replaced by ``new_attention`` below, which can re-weight the attention
+    map inside per-region boxes for selected token indices.
+
+    Args:
+        unet: model whose ``named_modules()`` are scanned.
+        tokens: one list of token indices per region.
+        rios: one ``(left, right, top, bottom)`` box per region, used as
+            fractions of the attention-map side length.
+        num_trailing_attn: per region, how many token indices following the
+            prompt are added to that region's token list.
+        length_prompt: number of prompt words; trailing indices start at
+            ``length_prompt + 1``.
+        noise_scale: per-region multiplier applied together with the module's
+            ``noise_level`` when ``use_add_noise`` is set.
+
+    NOTE(review): the per-region lists are indexed with the same ``i`` for
+    ``i < len(rios)``, so they presumably must all have at least that length.
+    """
+    # Work on private copies so later changes to the caller's lists do not
+    # leak into the closure below.
+    rios_ = copy.deepcopy(rios)
+    tokens_ = copy.deepcopy(tokens)
+    num_regions = len(rios)
+    num_trailing_attn_ = copy.deepcopy(num_trailing_attn)
+    noise_scale_ = copy.deepcopy(noise_scale)
+    for i in range(num_regions):
+        # Extend region i's tokens with the indices right after the prompt:
+        # length_prompt + 1 .. length_prompt + num_trailing_attn[i].
+        total_unrelated = length_prompt + 1 + num_trailing_attn[i]
+        tokens_trailing = []  # never read afterwards
+        for j in range(length_prompt + 1, total_unrelated):  # trailing
+            # 76 is the last valid index of the 77-wide token axis used below.
+            if j > 76:
+                break
+            tokens_[i].append(j)
+            # tokens_trailing.append(j)
+        # De-duplicate; going through a set does not preserve the order.
+        tokens_[i] = list(set(tokens_[i]))
+    def new_attention(self, query, key, value):
+        # Softmax over the last axis of the scaled query/key product.
+        attn_slice = torch.matmul(query, key.transpose(-1, -2)) * self.scale
+        attn_slice = attn_slice.softmax(dim=-1)
+        # Only edit maps whose last axis has 77 entries, and only when the
+        # caller armed the flag for this forward pass.
+        if self.use_edited_attn and attn_slice.size()[2] == 77:
+            in_tokens = tokens_[:]
+            rios = rios_[:]
+            num_trailing_attn = num_trailing_attn_[:]
+            # The middle axis is treated as a flattened square dim x dim grid.
+            dim = int(np.sqrt(attn_slice.size()[1]))
+            # NOTE(review): the leading 8 is hard-coded — presumably the head
+            # count for a batch of one; other sizes would make view() fail.
+            attn_slice = attn_slice.view(8, dim, dim, 77)
+            # the mask for all interested words in prompt
+            # global_mask = torch.ones_like(attn_slice, dtype=torch.bool)
+            global_mask = torch.zeros_like(attn_slice, dtype=torch.bool)
+            for i in range(num_regions):
+                # region
+                # left/right slice axis 2 and top/bottom slice axis 1 of the view.
+                left = int(dim * rios[i][0])
+                right = int(dim * rios[i][1])
+                top = int(dim * rios[i][2])
+                bottom = int(dim * rios[i][3])
+                # add noise to avoid nullity
+                if self.use_add_noise:
+                    # Scale the region's current attention for the selected tokens.
+                    tmp = attn_slice[
+                        :, top:bottom, left:right, in_tokens[i]
+                    ].clone() * (noise_scale_[i] * self.noise_level)
+                    # TODO: the uniform distribution on the region
+                    w = tmp.shape[2]
+                    h = tmp.shape[1]
+                    # noise_u = torch.abs(torch.randn(h, w))
+                    x = torch.linspace(0, h, h)
+                    y = torch.linspace(0, w, w)
+                    x, y = torch.meshgrid(x, y, indexing="ij")
+                    # Deterministic Gaussian bump centred on the region, with a
+                    # standard deviation of half the region size per axis.
+                    noise_g = gaussian_2d(
+                        x,
+                        y,
+                        mx=int(h / 2),
+                        my=int(w / 2),
+                        sx=float(h) / 2.0,
+                        sy=float(w) / 2.0,
+                    )
+                    noise = noise_g
+                    # Broadcast the (h, w) bump over the leading axis and the
+                    # selected tokens so it matches tmp's shape.
+                    noise = (
+                        noise.unsqueeze(0)
+                        .unsqueeze(-1)
+                        .repeat(tmp.shape[0], 1, 1, tmp.shape[-1])
+                        .to(attn_slice.device)
+                    )
+                    attn_slice[:, top:bottom, left:right, in_tokens[i]] = tmp + noise
+                scale = 0  # never read afterwards
+                # First mask: True only inside the box for this region's tokens;
+                # OR-ing it in marks the box as kept.
+                mask = torch.ones_like(attn_slice, dtype=torch.bool)
+                mask[:, :, right:, in_tokens[i]] = False
+                mask[:, :, :left, in_tokens[i]] = False
+                mask[:, :top, :, in_tokens[i]] = False
+                mask[:, bottom:, :, in_tokens[i]] = False
+                global_mask[..., in_tokens[i]] |= mask[..., in_tokens[i]]
+                # Second mask: True on the rows top:bottom and on the columns
+                # left:right (a cross through the box); AND-ing clears anything
+                # outside that cross for these tokens.
+                # NOTE(review): if two regions share a token index this can clear
+                # part of an earlier region's box — confirm that is intended.
+                mask = torch.zeros_like(attn_slice, dtype=torch.bool)
+                mask[:, top:bottom, :, in_tokens[i]] = True
+                mask[:, :, left:right, in_tokens[i]] = True
+                global_mask[..., in_tokens[i]] &= mask[..., in_tokens[i]]
+            # Every position still False is damped to 0.01 rather than zeroed.
+            # This includes all token indices that were never listed in tokens_,
+            # because global_mask starts out all False.
+            zeros_indices = torch.where(global_mask == False)
+            # global_mask = torch.tensor(
+            #     global_mask.clone().detach(), dtype=torch.float16
+            # )
+            # NOTE(review): the mask is cast to float16 regardless of
+            # attn_slice's dtype — presumably the model runs in half precision.
+            global_mask = global_mask.clone().detach().half()
+            global_mask[zeros_indices] = 0.01
+            attn_slice *= global_mask
+            # Restore the flattened layout expected by the matmul below.
+            attn_slice = attn_slice.view(8, dim * dim, 77)
+            # Both flags are one-shot: they are cleared after a single edited
+            # call and must be re-armed by the caller.
+            self.use_add_noise = False
+            self.use_edited_attn = False
+        # Keep a copy of the (possibly edited) attention on the module.
+        self.attn = attn_slice.clone()
+        hidden_states = torch.matmul(attn_slice, value)
+        # reshape hidden_states, e.g., torch.Size([1, 4096, 320])
+        hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
+        return hidden_states
+    for name, module in unet.named_modules():
+        module_name = type(module).__name__
+        # Matching is by class name only, so any class called CrossAttention
+        # is patched.
+        if module_name == "CrossAttention":
+            # Reset the control flags and cached attention to their defaults.
+            module.last_attn_slice = None
+            module.use_last_attn_slice = False
+            module.use_last_attn_weights = False
+            module.use_edited_attn = False
+            module.save_last_attn_slice = False
+            module.use_add_noise = False
+            module.noise_level = 0.0
+            module.attn = None
+            # Bind new_attention to this instance so `self` is the module.
+            module._attention = new_attention.__get__(module, type(module))

DirectedDiffusion/AttnEditorUtils.py ADDED Viewed

	@@ -0,0 +1,164 @@

+import torch
+import os
+import numpy as np
+import torchvision
+from PIL import Image
+from transformers import CLIPModel, CLIPTextModel, CLIPTokenizer, CLIPProcessor
+from diffusers import AutoencoderKL, UNet2DConditionModel
+def get_embeds(prompt, clip, clip_tokenizer, device="cuda"):
+    """Tokenize ``prompt`` and return the text encoder's last hidden state.
+
+    Args:
+        prompt: text passed to ``clip_tokenizer``.
+        clip: callable text encoder; its result must expose
+            ``last_hidden_state``.
+        clip_tokenizer: tokenizer providing ``model_max_length``; the prompt
+            is padded and truncated to that length.
+        device: device the token ids are moved to before encoding.
+
+    Returns:
+        The ``last_hidden_state`` produced by ``clip`` for the token ids.
+    """
+    tokenizer_options = dict(
+        padding="max_length",
+        max_length=clip_tokenizer.model_max_length,
+        truncation=True,
+        return_tensors="pt",
+        return_overflowing_tokens=True,
+    )
+    encoded = clip_tokenizer(prompt, **tokenizer_options)
+    token_ids = encoded.input_ids.to(device)
+    return clip(token_ids).last_hidden_state
+@torch.no_grad()
+def get_image_from_latent(vae, latent):
+    """Decode a latent with ``vae`` and return the first result as a PIL image.
+
+    Args:
+        vae: autoencoder exposing ``dtype`` and ``decode(...).sample``.
+        latent: latent tensor; it is divided by 0.18215 before decoding.
+
+    Returns:
+        PIL.Image.Image: the first decoded image, as 8-bit values.
+    """
+    unscaled = (latent / 0.18215).to(vae.dtype)
+    decoded = vae.decode(unscaled).sample
+    # Map the decoder output from [-1, 1] to [0, 1], clipping anything outside.
+    pixels = (decoded / 2 + 0.5).clamp(0, 1)
+    # Move the channel axis last and hand the data over to numpy.
+    pixels = pixels.cpu().permute(0, 2, 3, 1).detach().numpy()
+    first_image = (pixels[0] * 255).round().astype("uint8")
+    return Image.fromarray(first_image)
+@torch.no_grad()
+def get_latent_from_image(vae, image, device="cuda"):
+    """Encode an image into a VAE latent scaled by 0.18215.
+
+    Args:
+        vae: autoencoder exposing ``encode(...).latent_dist.sample``.
+        image: array-like of shape (height, width, channels) with 8-bit
+            values; a fourth channel is treated as alpha and composited
+            over white.
+        device: device the image tensor and the sampling generator live on.
+            NOTE(review): ``vae`` is assumed to already be on this device.
+
+    Returns:
+        The sampled latent multiplied by 0.18215.
+    """
+    # torch.cuda.manual_seed() returns None, so the original passed
+    # generator=None and reseeded the global CUDA RNG as a side effect.
+    # A dedicated generator keeps the fixed seed local and honours `device`.
+    generator = torch.Generator(device=device).manual_seed(798122)
+    # Resize and transpose for numpy b h w c -> torch b c h w
+    # image = image.resize((width, height), resample=Image.Resampling.LANCZOS)
+    image = np.array(image).astype(np.float16) / 255.0 * 2.0 - 1.0
+    image = torch.from_numpy(image[np.newaxis, ...].transpose(0, 3, 1, 2))
+    # If there is alpha channel, composite alpha for white, as the diffusion model does not support alpha channel
+    if image.shape[1] > 3:
+        # Every channel, alpha included, was rescaled to [-1, 1] above. Map
+        # alpha back to [0, 1] before blending, otherwise transparent pixels
+        # are blended with a negative weight. White is 1 in this range.
+        alpha = (image[:, 3:] + 1.0) / 2.0
+        image = image[:, :3] * alpha + (1 - alpha)
+    # Move image to GPU
+    image = image.to(device)
+    # Encode image
+    init_latent = vae.encode(image).latent_dist.sample(generator=generator) * 0.18215
+    return init_latent
+def load_all_models(model_path_diffusion):
+    """Load tokenizer, text encoder, UNet and VAE and move the networks to CUDA.
+
+    Args:
+        model_path_diffusion: model location handed to each ``from_pretrained``
+            call, e.g. "assets/models/stable-diffusion-v1-4".
+
+    Returns:
+        dict: the loaded objects under the keys ``"unet"``, ``"vae"``,
+        ``"clip_tokenizer"`` and ``"clip_text_model"``.
+    """
+    # Replace this with huggingface auth token as a string if model is not already downloaded
+    auth_token = True
+    tokenizer = CLIPTokenizer.from_pretrained(
+        model_path_diffusion, subfolder="tokenizer"
+    )
+    text_encoder = CLIPTextModel.from_pretrained(
+        model_path_diffusion, subfolder="text_encoder", torch_dtype=torch.float16
+    )
+    # The UNet and the VAE are loaded with the same half-precision settings.
+    fp16_options = dict(
+        use_auth_token=auth_token,
+        revision="fp16",
+        torch_dtype=torch.float16,
+    )
+    unet = UNet2DConditionModel.from_pretrained(
+        model_path_diffusion, subfolder="unet", **fp16_options
+    )
+    vae = AutoencoderKL.from_pretrained(
+        model_path_diffusion, subfolder="vae", **fp16_options
+    )
+    # Move to GPU
+    device = "cuda"
+    for network in (unet, vae, text_encoder):
+        network.to(device)
+    return {
+        "unet": unet,
+        "vae": vae,
+        "clip_tokenizer": tokenizer,
+        "clip_text_model": text_encoder,
+    }
+@torch.no_grad()
+def check_clip_score(clip_model, clip_processor, prompts=[], images=[]):
+    """Score prompts against images with CLIP and return softmax probabilities.
+
+    Args:
+        clip_model: model exposing ``device``, ``dtype``,
+            ``get_image_features`` and ``get_text_features``.
+        clip_processor: callable turning ``text`` and ``images`` into a
+            mapping with ``"pixel_values"`` and ``"input_ids"``.
+        prompts: texts to score.
+        images: images to score.
+
+    Returns:
+        torch.Tensor: softmax of the (images x prompts) feature product. It is
+        normalised across images when one prompt is compared with several
+        images, and across prompts otherwise.
+    """
+    # The original left `dim` unassigned when neither list had exactly one
+    # element; normalising across prompts is now the fallback. The two
+    # single-element cases keep their previous axes.
+    dim = 0 if len(prompts) == 1 and len(images) != 1 else 1
+    inputs = clip_processor(
+        text=prompts, images=images, return_tensors="pt", padding=True
+    )
+    # as_tensor moves/casts without re-wrapping values that are already tensors.
+    pixel_values = torch.as_tensor(
+        inputs["pixel_values"], dtype=clip_model.dtype, device=clip_model.device
+    )
+    input_ids = torch.as_tensor(inputs["input_ids"], device=clip_model.device)
+    # Only the two feature heads are needed; the full forward pass the
+    # original also ran was discarded unused.
+    a = clip_model.get_image_features(pixel_values)
+    b = clip_model.get_text_features(input_ids)
+    prob = torch.matmul(a, b.t()).softmax(dim=dim)
+    return prob
+def get_attn(unet, use=True):
+    """Sum the stored ``attn2`` attention maps and resize them to 64x64.
+
+    Only modules whose class name is ``CrossAttention``, whose module name
+    contains ``"attn2"`` and whose stored ``attn`` has size (8, 1024, 77)
+    contribute.
+
+    Args:
+        unet: model whose ``named_modules()`` are scanned.
+        use: unused; kept so existing callers keep working.
+
+    Returns:
+        numpy.ndarray: array of shape (64, 64, 77), one map per token index.
+    """
+    expected_size = torch.Size([8, 1024, 77])
+    attn = []
+    for name, module in unet.named_modules():
+        if type(module).__name__ != "CrossAttention" or "attn2" not in name:
+            continue
+        # `attn` stays None (or is absent) until the module has run once;
+        # skip such modules instead of failing on `.size()`.
+        stored = getattr(module, "attn", None)
+        if stored is not None and stored.size() == expected_size:
+            attn.append(stored)
+    attn = torch.cat(attn, dim=0)
+    attn = torch.sum(attn, dim=0)
+    # The flattened spatial axis is treated as a square grid; its side length
+    # is the same for every token, so compute it once.
+    dim = int(np.sqrt(attn.shape[0]))
+    resized = torch.zeros([64, 64, 77])
+    f = torchvision.transforms.Resize(size=(64, 64))
+    for i in range(77):
+        attn_slice = attn[..., i].view(1, dim, dim)
+        resized[..., i] = f(attn_slice)[0]
+    return resized.cpu().numpy()
+def save_attn(unet, folder="/tmp"):
+    """Save the stored attention of every ``attn2`` CrossAttention module.
+
+    One ``<module name>.pt`` file is written per matching module, and each
+    path is printed after saving.
+
+    Args:
+        unet: model whose ``named_modules()`` are scanned.
+        folder: output directory; defaults to the previously hard-coded
+            "/tmp" and is created if missing.
+    """
+    os.makedirs(folder, exist_ok=True)
+    for name, module in unet.named_modules():
+        module_name = type(module).__name__
+        if module_name == "CrossAttention" and "attn2" in name:
+            filepath = os.path.join(folder, name + ".pt")
+            torch.save(module.attn, filepath)
+            print(filepath)
+def use_add_noise(unet, level, use=True):
+    """Set the noise flag and noise level on every CrossAttention module.
+
+    Args:
+        unet: model whose ``named_modules()`` are scanned.
+        level: value stored as ``noise_level`` on each matching module.
+        use: value stored as ``use_add_noise`` on each matching module.
+    """
+    cross_attn_layers = (
+        layer
+        for _, layer in unet.named_modules()
+        if type(layer).__name__ == "CrossAttention"
+    )
+    for layer in cross_attn_layers:
+        layer.use_add_noise = use
+        layer.noise_level = level
+def use_edited_attention(unet, use=True):
+    """Set the ``use_edited_attn`` flag on every CrossAttention module.
+
+    Args:
+        unet: model whose ``named_modules()`` are scanned.
+        use: value stored as ``use_edited_attn`` on each matching module.
+    """
+    for _, layer in unet.named_modules():
+        if type(layer).__name__ != "CrossAttention":
+            continue
+        layer.use_edited_attn = use
+def prompt_token(prompt, index, clip_tokenizer=None):
+    """Return the decoded text of the token at position ``index`` of ``prompt``.
+
+    Args:
+        prompt: text to tokenize (padded/truncated to the tokenizer's
+            ``model_max_length``).
+        index: position in the token sequence to decode.
+        clip_tokenizer: tokenizer to use. The original body read a
+            module-level ``clip_tokenizer`` that this module never defines,
+            so the tokenizer can now be passed in explicitly.
+
+    Returns:
+        The tokenizer's decoding of the single token at ``index``.
+
+    Raises:
+        NameError: if no tokenizer is passed and no module-level
+            ``clip_tokenizer`` exists (the same error type as before).
+    """
+    if clip_tokenizer is None:
+        # Fall back to a module-level name so code that injected one keeps
+        # working exactly as it did.
+        clip_tokenizer = globals().get("clip_tokenizer")
+        if clip_tokenizer is None:
+            raise NameError(
+                "prompt_token() needs a tokenizer: pass clip_tokenizer=..."
+            )
+    tokens = clip_tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=clip_tokenizer.model_max_length,
+        truncation=True,
+        return_tensors="pt",
+        return_overflowing_tokens=True,
+    ).input_ids[0]
+    return clip_tokenizer.decode(tokens[index : index + 1])

DirectedDiffusion/Diffusion.py ADDED Viewed

	@@ -0,0 +1,149 @@

+import os
+import torch
+import random
+import numpy as np
+import datetime
+from PIL import Image
+from diffusers import LMSDiscreteScheduler
+from tqdm.auto import tqdm
+from torch import autocast
+from difflib import SequenceMatcher
+import DirectedDiffusion
+@torch.no_grad()
+def stablediffusion(
+    model_bundle,
+    attn_editor_bundle={},
+    device="cuda",
+    prompt="",
+    steps=50,
+    seed=None,
+    width=512,
+    height=512,
+    t_start=0,
+    guidance_scale=7.5,
+    init_latents=None,
+    is_save_attn=False,
+    is_save_recons=False,
+    folder = "./",
+):
+    """Run a classifier-free-guided denoising loop with optional attention editing.
+
+    Args:
+        model_bundle: dict with the keys "unet", "vae", "clip_tokenizer" and
+            "clip_text_model".
+        attn_editor_bundle: optional dict read for "num_affected_steps",
+            "edit_index", "roi", "noise_scale" and "num_trailing_attn";
+            missing or falsy entries fall back to 0 / empty lists.
+        device: device for the latent and noise tensors.
+            NOTE(review): the scheduler timesteps use ``.cuda()`` and the text
+            embeddings use ``get_embeds``' own default device, so values other
+            than "cuda" are presumably unsupported — confirm.
+        prompt: conditioning text.
+        steps: number of scheduler inference steps.
+        seed: RNG seed; a random one is drawn when None.
+        width, height: output size, rounded down to a multiple of 64.
+        t_start: index of the first scheduler timestep to run.
+        guidance_scale: weight of the conditional/unconditional difference.
+        init_latents: not referenced anywhere in the body.
+        is_save_attn: save an attention plot after every step.
+        is_save_recons: save the decoded latent after every step.
+        folder: parent directory for the timestamped output folder.
+
+    Returns:
+        The image decoded from the final latent by
+        ``AttnEditorUtils.get_image_from_latent``.
+    """
+    # neural networks
+    unet = model_bundle["unet"]
+    vae = model_bundle["vae"]
+    clip_tokenizer = model_bundle["clip_tokenizer"]
+    clip = model_bundle["clip_text_model"]
+    # attn editor bundle, our stuff
+    num_affected_steps = int(attn_editor_bundle.get("num_affected_steps") or 0)
+    if not num_affected_steps:
+        print("Not using attn editor")
+    else:
+        print("Using attn editor")
+    # The attention modules are patched on every call, whether or not the
+    # editor is used; with zero affected steps the edit flags are never armed.
+    # NOTE(review): length_prompt counts whitespace-separated words, which may
+    # differ from the tokenizer's token count — verify against the tokenizer.
+    DirectedDiffusion.AttnCore.init_attention_edit(
+        unet,
+        tokens=attn_editor_bundle.get("edit_index") or [],
+        rios=attn_editor_bundle.get("roi") or [],
+        noise_scale=attn_editor_bundle.get("noise_scale") or [],
+        length_prompt=len(prompt.split(" ")),
+        num_trailing_attn=attn_editor_bundle.get("num_trailing_attn") or [],
+    )
+    # Change size to multiple of 64 to prevent size mismatches inside model
+    width = width - width % 64
+    height = height - height % 64
+    # If seed is None, randomly select seed from 0 to 2^32-1
+    if seed is None:
+        seed = random.randrange(2 ** 32 - 1)
+    # NOTE(review): torch.cuda.manual_seed is documented as returning nothing,
+    # so `generator` is presumably None and the randn call below relies on the
+    # globally seeded CUDA RNG — confirm.
+    generator = torch.cuda.manual_seed(seed)
+    # Set inference timesteps to scheduler
+    scheduler = LMSDiscreteScheduler(
+        beta_start=0.00085,
+        beta_end=0.012,
+        beta_schedule="scaled_linear",
+        num_train_timesteps=1000,
+    )
+    scheduler.set_timesteps(steps)
+    scheduler.timesteps = scheduler.timesteps.half().cuda()
+    # Second scheduler used only as a source of per-step noise levels: its
+    # timesteps[i] is passed to use_add_noise during the affected steps.
+    noise_weight = LMSDiscreteScheduler(
+        beta_start=0.00085,
+        beta_end=0.012,
+        beta_schedule="scaled_linear",
+        num_train_timesteps=10,
+    )
+    # NOTE(review): this runs even when num_affected_steps is 0 — confirm the
+    # scheduler accepts zero inference steps.
+    noise_weight.set_timesteps(num_affected_steps)
+    # if num_affected_steps:
+    #     noise_weight.set_timesteps(num_affected_steps)
+    #     noise_weight.timesteps /= torch.max(noise_weight.timesteps)
+    # The starting latent is all zeros, so after add_noise the initial latent
+    # is derived from the noise alone.
+    init_latent = torch.zeros(
+        (1, unet.in_channels, height // 8, width // 8), device=device
+    )
+    t_start = t_start
+    # Generate random normal noise
+    noise = torch.randn(init_latent.shape, generator=generator, device=device)
+    # latent = noise * scheduler.init_noise_sigma
+    latent = scheduler.add_noise(
+        init_latent,
+        noise,
+        torch.tensor(
+            [scheduler.timesteps[t_start]], device=device, dtype=torch.float16
+        ),
+    ).to(device)
+    # Outputs go into "<folder>/<yymmdd-HHMMSS>_internal"; the directory is
+    # only created when something will actually be saved.
+    current_time = datetime.datetime.now()
+    current_time = current_time.strftime("%y%m%d-%H%M%S")
+    folder = os.path.join(folder, current_time+"_internal")
+    if not os.path.exists(folder) and (is_save_attn or is_save_recons):
+        os.makedirs(folder)
+    # Process clip
+    with autocast(device):
+        # Embeddings for the empty prompt (unconditional) and the real prompt.
+        embeds_uncond = DirectedDiffusion.AttnEditorUtils.get_embeds(
+            "", clip, clip_tokenizer
+        )
+        embeds_cond = DirectedDiffusion.AttnEditorUtils.get_embeds(
+            prompt, clip, clip_tokenizer
+        )
+        timesteps = scheduler.timesteps[t_start:]
+        for i, t in tqdm(enumerate(timesteps), total=len(timesteps)):
+            t_index = t
+            latent_model_input = latent
+            latent_model_input = scheduler.scale_model_input(
+                latent_model_input, t
+            ).half()
+            # The unconditional pass runs first, before the edit flags are
+            # armed, so it is never edited.
+            noise_pred_uncond = unet(
+                latent_model_input, t, encoder_hidden_states=embeds_uncond
+            ).sample
+            if i < num_affected_steps:
+                # Arm the one-shot noise and edit flags; the patched attention
+                # clears them again once it has applied an edit.
+                DirectedDiffusion.AttnEditorUtils.use_add_noise(
+                    unet, noise_weight.timesteps[i]
+                )
+                DirectedDiffusion.AttnEditorUtils.use_edited_attention(unet)
+                noise_pred_cond = unet(
+                    latent_model_input, t, encoder_hidden_states=embeds_cond
+                ).sample
+            else:
+                noise_pred_cond = unet(
+                    latent_model_input, t, encoder_hidden_states=embeds_cond
+                ).sample
+            delta = noise_pred_cond - noise_pred_uncond
+            # Perform guidance
+            noise_pred = noise_pred_uncond + guidance_scale * delta
+            latent = scheduler.step(noise_pred, t_index, latent).prev_sample
+            if is_save_attn:
+                filepath = os.path.join(folder, "ca.{:04d}.jpg".format(i))
+                DirectedDiffusion.Plotter.plot_activation(filepath, unet, prompt, clip_tokenizer)
+            if is_save_recons:
+                filepath = os.path.join(folder, "recons.{:04d}.jpg".format(i))
+                recons = DirectedDiffusion.AttnEditorUtils.get_image_from_latent(vae, latent)
+                recons.save(filepath)
+    return DirectedDiffusion.AttnEditorUtils.get_image_from_latent(vae, latent)

DirectedDiffusion/Plotter.py ADDED Viewed

	@@ -0,0 +1,29 @@

+"""
+"""
+import matplotlib.pyplot as plt
+import numpy as np
+import torchvision
+import DirectedDiffusion
+# Scale matplotlib's configured figure size by 1.5 for the plots saved below.
+# NOTE(review): this multiplies the *current* setting each time the module
+# body executes, and the package __init__ calls importlib.reload(Plotter),
+# so the factor presumably compounds (1.5 x 1.5) — confirm this is intended.
+plt.rcParams["figure.figsize"] = [float(v)*1.5 for v in plt.rcParams["figure.figsize"]]
+def plot_activation(filepath, unet, prompt, clip_tokenizer):
+    """Save a heat-map image of the attention maps of the prompt's words.
+
+    The maps for token indices 1..n (n = number of space-separated words in
+    ``prompt``) are each normalised by their own maximum and stacked
+    vertically into one image.
+
+    Args:
+        filepath: destination passed to ``savefig``.
+        unet: model handed to ``AttnEditorUtils.get_attn``.
+        prompt: text used for the word count and as the plot title.
+        clip_tokenizer: unused; kept so existing callers keep working.
+    """
+    a = DirectedDiffusion.AttnEditorUtils.get_attn(unet)
+    # Index 0 is skipped, so at most (number of maps - 1) words can be shown;
+    # clamping keeps very long prompts from indexing past the last map.
+    n = min(len(prompt.split(" ")), a.shape[-1] - 1)
+    # The small constant guards against division by zero for empty maps.
+    tiles = [a[..., i + 1] / (a[..., i + 1].max() + 0.001) for i in range(n)]
+    arrs = np.vstack(tiles)
+    # Draw on a dedicated figure and close it afterwards. The original drew
+    # on the implicit current figure and never closed it, so repeated calls
+    # piled images onto the same axes.
+    fig, ax = plt.subplots()
+    ax.imshow(arrs, cmap='jet', vmin=0, vmax=.8)
+    ax.set_title(prompt)
+    fig.savefig(filepath)
+    plt.close(fig)

DirectedDiffusion/ProgramInfo.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import textwrap
+def get_parser_description():
+    """Return the dedented, multi-line usage text for the command-line editor."""
+    return textwrap.dedent(
+        """\
+        ***Stable Diffusion Local Editor***
+            ** Examples (Single run, single region):
+                # To locate the car at top right of the image
+                python ./bin/SdEditorCmd.py -roi "0.5,1.0,0.0,0.5" -ei "1,2,3" -nt "10" -s "2.0" -ns 15 -p "A yellow car on a bridge" -m
+            ** Example (Single run, multiple regions)
+                The following argument flags are part of regioning strategy controlling the effects of specific region during attention editing step.
+                Their length must be the same, otherwise the program will be terminated.
+                    --num-trailing-attn (-nt) # a string of integers
+                    --noise-scale (-s) # a string of floats
+                    --edit-index (-ei) # multiple strings of integers
+                    --region-of-interest (-roi) # multiple strings of integers
+                # Two region case
+                python ./bin/SdEditorCmd.py -roi "0.4,0.7,0.1,0.5" "0.4,0.7,0.5,0.9" -ei "2,3" "8,9" -nt "30,30" -ns 10 -s "1.0,1.0" -p "A red cube on top of a blue sphere" -m -sd 2483964026830
+            ** Examples (Grid Search):
+                The following arguments are part of the grid search method to speed up the experimental efficiency:
+                    --num-trailing-attn (-nt)
+                    --noise-scale(-s)
+                    --num-affected-steps(-ns)
+                    --diffusion-steps (-ds)
+                # The following command will run four times with varied options of -nt and -ns
+                python ./bin/SdEditorCmd.py -roi "0.5,1.0,0.0,0.5" -ei "1,2,3" -nt 5 10 20 -ns 5 10 -s 2.5 -p "A yellow car running on a bridge" -m
+            ** Others
+                Using -m flag will draw the metadata on the saved image for quick reference.
+                Using -is flag will show the final result after each diffusion run
+            ** Lazy search
+                We offer a lazy grid search command at the initial experiment stage, for instance
+                # for large number of parameters
+                python ./bin/SdEditorCmd.py -roi "0.4,0.7,0.1,0.5" "0.4,0.7,0.5,0.9" -ei "2,3" "8,9" -p "A red cube on top of a blue sphere" -l1
+                # relatively smaller number of parameters
+                python ./bin/SdEditorCmd.py -roi "0.4,0.7,0.1,0.5" "0.4,0.7,0.5,0.9" -ei "2,3" "8,9" -p "A red cube on top of a blue sphere" -l2
+                This also contains -m function
+           See more examples under scripts/sdeditor-example.sh
+            """
+    )

DirectedDiffusion/__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+# Import every submodule so each is reachable as an attribute of the package.
+from . import Diffusion
+from . import AttnCore
+from . import AttnEditorUtils
+from . import Plotter
+from . import ProgramInfo
+import importlib
+# Re-execute each submodule immediately after importing it.
+# NOTE(review): presumably a development convenience for picking up source
+# edits in a long-lived interpreter. It also means each submodule's
+# module-level code runs twice on first import — confirm this is intended.
+importlib.reload(Diffusion)
+importlib.reload(AttnCore)
+importlib.reload(AttnEditorUtils)
+importlib.reload(Plotter)
+importlib.reload(ProgramInfo)

app.py CHANGED Viewed

@@ -1,7 +1,9 @@
 import gradio as gr
 def greet(name):
-    return "Hello " + name + "!!"
 iface = gr.Interface(fn=greet, inputs="text", outputs="text")
 iface.launch()

+import DirectedDiffusion
 import gradio as gr
 def greet(name):
+    text = str(DirectedDiffusion)
+    return "Hello " + name + "!!" + text
 iface = gr.Interface(fn=greet, inputs="text", outputs="text")
 iface.launch()