Upload 5 files
Browse files- handler.py +75 -0
- model.py +109 -0
- noise_scheduler.py +46 -0
- requirements.txt +8 -0
- unet.pt +3 -0
handler.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from diffusers import AutoencoderKL
|
| 2 |
+
from transformers import CLIPProcessor, CLIPModel
|
| 3 |
+
from model import Model
|
| 4 |
+
from noise_scheduler import NoiseSchedule
|
| 5 |
+
import torch
|
| 6 |
+
import base64
|
| 7 |
+
from typing import Any, Dict
|
| 8 |
+
|
| 9 |
+
# Global configuration matching the trained checkpoint (unet.pt).
LDM = True                      # latent diffusion: UNet works in VAE latent space, not pixels
image_size = 512                # decoded RGB output resolution; presumably 8x the latent size — TODO confirm
latent_size = 64                # spatial size of the VAE latent grid
filters = [64, 128, 256, 512]   # UNet channel widths per resolution level
latent_dim = 4                  # VAE latent channels
t_dim = 512                     # timestep-embedding dimension fed to the UNet
T = 1000                        # diffusion timesteps the model was trained with
depth = 2                       # ResBlocks per UNet level
|
| 17 |
+
|
| 18 |
+
class CLIP:
    """Frozen CLIP ViT-B/32 wrapper exposing image and text embedding helpers."""

    def __init__(self):
        checkpoint = "openai/clip-vit-base-patch32"
        self.processor = CLIPProcessor.from_pretrained(checkpoint)
        self.model = CLIPModel.from_pretrained(checkpoint)
        self.model.eval()
        # CLIP is only used for scoring, never trained: freeze every weight.
        for _, weight in self.model.named_parameters():
            weight.requires_grad = False

    @torch.inference_mode()
    def embed_images(self, images):
        """Return CLIP image features for a batch of images."""
        batch = self.processor(images=images, return_tensors="pt").to(self.model.device)
        return self.model.get_image_features(**batch)

    @torch.inference_mode()
    def embed_text(self, text):
        """Return CLIP text features for a string or list of strings."""
        tokens = self.processor(text, padding=True, return_tensors="pt").to(self.model.device)
        return self.model.get_text_features(**tokens)
|
| 35 |
+
|
| 36 |
+
class Inference:
    """Generate latent-diffusion samples and pick the one that best matches a CLIP prompt.

    Args:
        prompt: text used as the CLIP scoring target. Default preserves the
            original hard-coded behavior, and lets EndpointHandler pass
            ``prompt=...`` (the previous signature rejected that keyword).
    """

    def __init__(self, prompt: str = "A photo of a cat"):
        self.clip = CLIP()
        device = 'cuda' if torch.cuda.is_available() else "cpu"
        self.ae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae").to(device)
        self.ae.eval()
        for _, param in self.ae.named_parameters():
            param.requires_grad = False
        self.unet = Model(T=T, filters=filters, t_dim=t_dim, depth=depth, LDM=LDM)
        # NOTE(review): weights_only=False deserializes arbitrary pickle code;
        # the checkpoint ships with this repo, but prefer weights_only=True if
        # unet.pt contains only tensors.
        self.unet.load_state_dict(torch.load("unet.pt", weights_only=False, map_location=torch.device('cpu')))
        self.unet.eval()
        for _, param in self.unet.named_parameters():
            param.requires_grad = False
        self.noise_scheduler = NoiseSchedule(T=T, shape=(latent_dim, latent_size, latent_size), ddim_mod=50, trainer_mode=True)
        # Pre-compute the unit-norm CLIP text embedding once.
        self.target_vector = self.clip.embed_text(prompt)[0]
        self.target_vector = self.target_vector / self.target_vector.norm(p=2, dim=-1, keepdim=True)

    @torch.inference_mode()
    def __call__(self, num_images=8):
        """Sample ``num_images`` latents, decode them, and return
        ``(best_image, best_score, all_scores)`` ranked by CLIP similarity.

        ``best_image`` is an (H, W, 3) float tensor in [0, 1].
        """
        latents = self.noise_scheduler.generate(self.unet, num_images=num_images, device='cpu')
        images = []
        for latent in latents:
            # Undo the SD latent scaling, decode, then map [-1, 1] -> [0, 1] HWC.
            image = self.ae.decode(latent.unsqueeze(0) / self.ae.config.scaling_factor)[0][0].cpu().permute(1, 2, 0) / 2 + 0.5
            images.append(torch.clamp(image, 0.0, 1.0))
        embeddings = self.clip.embed_images(images)
        # Cosine similarity: image embeddings are normalized; target already is.
        scores = (embeddings / embeddings.norm(p=2, dim=-1, keepdim=True)) @ self.target_vector.T
        i = torch.argmax(scores).item()
        return images[i], scores[i], scores
|
| 65 |
+
|
| 66 |
+
class EndpointHandler:
    """Hugging Face Inference Endpoints entry point."""

    def __init__(self, path: str = ""):
        # path -> repo directory on the endpoint container
        # you can read files via Path(path)/"unet.pt" if needed
        self.engine = Inference(prompt="A photo of a cat")

    def __call__(self, data: Dict[str, Any] = None) -> Dict[str, Any]:
        """Generate image(s) and return the best one as base64 PNG plus its CLIP score.

        ``data`` (the endpoint payload) may carry {"inputs": {"num_images": int}};
        defaults to 1. Fixes two defects in the previous version: Inference
        returns a 3-tuple (image, score, all_scores), and the image is a float
        HWC tensor that must be PNG-encoded before base64.
        """
        num_images = 1
        if data:
            inputs = data.get("inputs") or {}
            if isinstance(inputs, dict):
                num_images = int(inputs.get("num_images", 1))
        image, score, _ = self.engine(num_images=num_images)
        # Encode the [0, 1] float HWC tensor as PNG (Pillow is in requirements.txt).
        import io
        from PIL import Image
        arr = (image * 255).round().clamp(0, 255).to(torch.uint8).numpy()
        buf = io.BytesIO()
        Image.fromarray(arr).save(buf, format="PNG")
        b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
        return {"image": b64, "score": float(score)}
|
model.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
import math
|
| 5 |
+
|
| 6 |
+
class SpatialAttention(nn.Module):
    """Single-head self-attention over the spatial positions of a feature map,
    pre-normed, with 1x1-conv projections and a residual connection."""

    def __init__(self, in_c):
        super().__init__()
        self.norm = nn.GroupNorm(num_groups=32, num_channels=in_c, eps=1e-6, affine=True)
        # 1x1 convs act as per-position linear projections for query/key/value.
        self.Q = nn.Conv2d(in_c, in_c, kernel_size=1, stride=1, padding=0)
        self.K = nn.Conv2d(in_c, in_c, kernel_size=1, stride=1, padding=0)
        self.V = nn.Conv2d(in_c, in_c, kernel_size=1, stride=1, padding=0)
        self.proj = nn.Conv2d(in_c, in_c, kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        b, c, h, w = x.shape
        R = self.norm(x)
        q, v, k = self.Q(R), self.V(R), self.K(R)
        # Flatten spatial dims: each pixel becomes one token of dim c.
        q, v, k = q.reshape(b, c, h*w), v.reshape(b, c, h*w), k.reshape(b, c, h*w)
        q, v, k = q.permute(0, 2, 1), v, k          # q: (b, hw, c); k, v stay (b, c, hw)
        R = torch.bmm(q, k) * (1.0 / math.sqrt(c))  # (b, hw_q, hw_k) scaled dot products
        R = F.softmax(R, dim=2)                     # normalize over key positions
        # NOTE(review): standard attention (e.g. the CompVis VAE AttnBlock) would
        # transpose R before bmm(v, R) so each query aggregates values over keys;
        # as written, positions are mixed along the query axis instead. The
        # shipped checkpoint was trained with this exact form, so it is left
        # unchanged — confirm before "fixing", as a fix would invalidate unet.pt.
        R = torch.bmm(v, R)
        R = R.reshape(b, c, h, w)
        # Output projection plus residual skip.
        return self.proj(R) + x
|
| 26 |
+
|
| 27 |
+
class ResBlock(nn.Module):
    """Two-conv residual block (GroupNorm + swish), with an optional 3x3
    projection applied first when input and output channel counts differ."""

    def __init__(self, in_c, out_c):
        super().__init__()
        self.reshape = in_c != out_c
        if self.reshape:
            # Bring the input to out_c channels before the residual branch.
            self.conv_reshape = nn.Conv2d(in_c, out_c, kernel_size=3, stride=1, padding=1)
        self.norm1 = nn.GroupNorm(num_groups=32, num_channels=out_c, eps=1e-6, affine=True)
        self.conv1 = nn.Conv2d(out_c, out_c, kernel_size=3, stride=1, padding=1)
        self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_c, eps=1e-6, affine=True)
        self.conv2 = nn.Conv2d(out_c, out_c, kernel_size=3, stride=1, padding=1)

    @staticmethod
    def _swish(t):
        # x * sigmoid(x), written out to match the original computation exactly.
        return t * torch.sigmoid(t)

    def forward(self, x):
        if self.reshape:
            x = self.conv_reshape(x)
        skip = x
        out = self.conv1(self._swish(self.norm1(x)))
        out = self.conv2(self._swish(self.norm2(out)))
        return out + skip
|
| 51 |
+
|
| 52 |
+
class Model(nn.Module):
    """UNet noise (epsilon) predictor for DDPM/DDIM sampling.

    The timestep enters by broadcasting its sinusoidal embedding over the
    spatial grid and concatenating it to the input channels before conv_in.

    Args:
        T: number of diffusion timesteps (scales the embedding frequencies).
        filters: channel widths per resolution level.
        depth: ResBlocks per level.
        t_dim: sinusoidal timestep-embedding dimension.
        LDM: if True, input/output have 4 latent channels instead of 3 RGB.
    """

    def __init__(self, T=1000, filters=[32, 64, 96, 128], depth=2, t_dim=512, LDM=False):
        # NOTE(review): mutable default argument `filters` — harmless here since
        # it is never mutated in place, but a tuple default would be safer.
        super().__init__()
        self.t_dim = t_dim
        self.T = T
        # Input channels = data channels (4 latent / 3 RGB) + broadcast t-embedding.
        self.conv_in = nn.Conv2d(4 + self.t_dim if LDM else 3 + self.t_dim, filters[0], kernel_size=1)
        self.down = nn.ModuleList([])
        for i in range(1, len(filters)):
            block = nn.Module()
            # First block changes channel count; remaining depth-1 keep it.
            block.Blocks = nn.ModuleList([ResBlock(filters[i-1], filters[i])])
            for _ in range(1, depth):
                block.Blocks.append(ResBlock(filters[i], filters[i]))
            # Strided conv halves the spatial resolution.
            block.DownSample = nn.Conv2d(filters[i], filters[i], kernel_size=3, stride=2, padding=1)
            self.down.append(block)

        self.mid = nn.Sequential(ResBlock(filters[-1], filters[-1]),
                                 SpatialAttention(filters[-1]),
                                 ResBlock(filters[-1], filters[-1]))

        self.up = nn.ModuleList([])
        filters = filters[::-1]  # reversed channel ladder for the decoder side
        for i in range(1, len(filters)):
            block = nn.Module()
            # *2 input channels: upsampled features are concatenated with the skip.
            block.Blocks = nn.ModuleList([ResBlock(filters[i-1]*2, filters[i])])
            for _ in range(1, depth):
                block.Blocks.append(ResBlock(filters[i], filters[i]))
            block.UpSample = nn.Upsample(scale_factor=2, mode="bilinear")
            self.up.append(block)
        self.conv_out = nn.Conv2d(filters[-1], 4 if LDM else 3, kernel_size=3, padding=1)

    def get_sinusoidal_emb(self, t):
        """Receives a (B,) tensor of scalar timesteps, returns (B, t_dim) embeddings.

        (Shape per `t[:, None]` below and the (num_images,) tensor built by
        NoiseSchedule.generate — a (B, 1) input would break the expand in forward.)
        """
        # Geometric frequency ladder from 1 down to 1/T across t_dim/2 channels.
        freqs = torch.exp(-math.log(self.T) * torch.arange(start=0, end=self.t_dim // 2, dtype=torch.float32) / (self.t_dim // 2)).to(device=t.device)
        args = t[:, None].float() * freqs[None]
        return torch.cat([torch.cos(args), torch.sin(args)], dim=-1)

    def forward(self, x, t):
        """Predict the noise for batch x at timesteps t (shape (B,))."""
        t_emb = self.get_sinusoidal_emb(t)
        B, C, H, W = x.shape

        # Broadcast the embedding over the spatial grid and feed it as extra channels.
        t_emb = t_emb.unsqueeze(-1).unsqueeze(-1).expand(B, self.t_dim, H, W)
        x = torch.cat((x, t_emb), 1)
        x = self.conv_in(x)

        cache = []  # skip connections, one snapshot per resolution level
        for block in self.down:
            for resblock in block.Blocks:
                x = resblock(x)
            cache.append(x.clone())  # captured before downsampling
            x = block.DownSample(x)
        x = self.mid(x)
        for block in self.up:
            x = block.UpSample(x)
            x = torch.cat((x, cache.pop()), 1)  # concat matching-resolution skip
            for resblock in block.Blocks:
                x = resblock(x)

        return (self.conv_out(x))
|
noise_scheduler.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import torch
|
| 3 |
+
from IPython.display import clear_output
|
| 4 |
+
|
| 5 |
+
class NoiseSchedule:
    """
    Handles:
    - DDIM inference (with a ddim_mod to skip steps)
    - DDPM inference
    - Forward Noising
    - Linear beta schedule
    - Classifier Free Guidance (w is a hyperparameter for cfg schedule)
    """

    def __init__(self, T, std=1, shape=(4, 64, 64), ddim_mod=10, trainer_mode=False):
        """Linear beta schedule over T steps; tensors live on CPU in trainer_mode."""
        device = 'cpu' if trainer_mode else 'cuda'
        self.T = T
        self.std = std
        self.ddim_mod = ddim_mod  # DDIM stride: timesteps skipped per sampling step
        self.beta = torch.tensor(np.linspace(1e-4, 0.02, T), dtype=torch.float32, device=device)
        self.alpha = 1 - self.beta
        self.alpha_bar = self.alpha.cumprod(dim=0)
        self.w = torch.full((T,), 7.5, device=device)  # per-step CFG guidance scale
        self.shape = shape

    def noise(self, x, t):
        """Forward-noise clean x to timestep t; returns (x_t, eps)."""
        eps = torch.randn_like(x) * self.std
        return (self.alpha_bar[t]**0.5) * x + ((1-self.alpha_bar[t])**0.5) * eps, eps

    def ddim_step(self, xt, t, eps):
        """One deterministic DDIM step from timestep t to max(0, t - ddim_mod)."""
        # Recover x0 from the eps-prediction, clamped to the data range.
        x0 = (xt - (1 - self.alpha_bar[t]).sqrt() * eps) / self.alpha_bar[t].sqrt()
        x0 = x0.clamp(-1, 1)
        # note that eps = (xt - sqrt(abar[t]) * x0) / sqrt(1 - abar[t])
        t_prev = max(0, t - self.ddim_mod)
        xt_1 = self.alpha_bar[t_prev].sqrt() * x0 + (1 - self.alpha_bar[t_prev]).sqrt() * eps
        return xt_1

    def ddpm_step(self, x, eps, t, var=None):
        """One stochastic ancestral (DDPM) step from x_t toward x_{t-1}.

        Fixed two defects against the DDPM posterior (Ho et al. 2020, eq. 11):
        the eps coefficient is beta_t / sqrt(1 - alpha_bar_t), not
        sqrt(1 - alpha_bar_t); and the injected noise is scaled by the standard
        deviation sqrt(var), not the variance itself.
        """
        var = self.beta[t] if var is None else var
        mean = (self.alpha[t]**-0.5) * (x - (self.beta[t] / (1 - self.alpha_bar[t])**0.5) * eps)
        return mean + (var**0.5) * torch.randn_like(x)

    def generate(self, model, num_images=16, device="cuda"):
        """DDIM-sample num_images tensors of self.shape using model(x, t)."""
        with torch.no_grad():
            x = torch.randn((num_images, *self.shape), device=device) * self.std
            # Walk t = T-1, T-1-mod, ... down toward 0, skipping ddim_mod steps.
            for t in range(self.T-1, -1, -self.ddim_mod):
                t_tensor = torch.full((num_images,), t, device=device)
                epsilons = model(x, t=t_tensor)
                x = self.ddim_step(x, t=t, eps=epsilons)
            return x
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch
|
| 2 |
+
transformers
|
| 3 |
+
diffusers
|
| 4 |
+
accelerate
|
| 5 |
+
safetensors
|
| 6 |
+
Pillow
|
| 7 |
+
opencv-python-headless
|
| 8 |
+
numpy
|
unet.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c7045782f1dfbb51037ec23ca06142d4b5d60dedbfd28cafd3dbe5e07cead738
|
| 3 |
+
size 135132829
|