JBlitzar commited on
Commit
897982a
·
1 Parent(s): ed96a20
Files changed (11) hide show
  1. app.py +19 -67
  2. bert_vectorize.py +27 -0
  3. factories.py +343 -0
  4. infer.py +43 -0
  5. logger.py +40 -0
  6. pipeline.py +69 -0
  7. predict.py +54 -0
  8. runner.py +80 -0
  9. runs/run_3_jxa/ckpt/latest.pt +3 -0
  10. runs/run_3_jxa/ckpt/latest_cpu.pt +3 -0
  11. wrapper.py +198 -0
app.py CHANGED
@@ -2,42 +2,30 @@ import gradio as gr
2
  import numpy as np
3
  import random
4
  #import spaces #[uncomment to use ZeroGPU]
5
- from diffusers import DiffusionPipeline
6
  import torch
7
 
8
  device = "cuda" if torch.cuda.is_available() else "cpu"
9
- model_repo_id = "stabilityai/sdxl-turbo" #Replace to the model you would like to use
10
 
11
  if torch.cuda.is_available():
12
  torch_dtype = torch.float16
13
  else:
14
  torch_dtype = torch.float32
15
 
16
- pipe = DiffusionPipeline.from_pretrained(model_repo_id, torch_dtype=torch_dtype)
17
- pipe = pipe.to(device)
18
 
19
  MAX_SEED = np.iinfo(np.int32).max
20
  MAX_IMAGE_SIZE = 1024
21
 
22
  #@spaces.GPU #[uncomment to use ZeroGPU]
23
- def infer(prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps, progress=gr.Progress(track_tqdm=True)):
24
 
25
- if randomize_seed:
26
- seed = random.randint(0, MAX_SEED)
27
-
28
- generator = torch.Generator().manual_seed(seed)
29
 
30
  image = pipe(
31
- prompt = prompt,
32
- negative_prompt = negative_prompt,
33
- guidance_scale = guidance_scale,
34
- num_inference_steps = num_inference_steps,
35
- width = width,
36
- height = height,
37
- generator = generator
38
  ).images[0]
39
 
40
- return image, seed
41
 
42
  examples = [
43
  "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
@@ -75,58 +63,22 @@ with gr.Blocks(css=css) as demo:
75
 
76
  with gr.Accordion("Advanced Settings", open=False):
77
 
78
- negative_prompt = gr.Text(
79
- label="Negative prompt",
80
- max_lines=1,
81
- placeholder="Enter a negative prompt",
82
- visible=False,
83
- )
84
-
85
- seed = gr.Slider(
86
- label="Seed",
87
- minimum=0,
88
- maximum=MAX_SEED,
89
- step=1,
90
- value=0,
91
- )
92
-
93
- randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
94
-
95
- with gr.Row():
96
-
97
- width = gr.Slider(
98
- label="Width",
99
- minimum=256,
100
- maximum=MAX_IMAGE_SIZE,
101
- step=32,
102
- value=1024, #Replace with defaults that work for your model
103
- )
104
-
105
- height = gr.Slider(
106
- label="Height",
107
- minimum=256,
108
- maximum=MAX_IMAGE_SIZE,
109
- step=32,
110
- value=1024, #Replace with defaults that work for your model
111
  )
112
 
113
- with gr.Row():
114
-
115
- guidance_scale = gr.Slider(
116
- label="Guidance scale",
117
- minimum=0.0,
118
- maximum=10.0,
119
- step=0.1,
120
- value=0.0, #Replace with defaults that work for your model
121
- )
122
-
123
- num_inference_steps = gr.Slider(
124
- label="Number of inference steps",
125
- minimum=1,
126
- maximum=50,
127
  step=1,
128
- value=2, #Replace with defaults that work for your model
129
  )
 
130
 
131
  gr.Examples(
132
  examples = examples,
@@ -135,8 +87,8 @@ with gr.Blocks(css=css) as demo:
135
  gr.on(
136
  triggers=[run_button.click, prompt.submit],
137
  fn = infer,
138
- inputs = [prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps],
139
- outputs = [result, seed]
140
  )
141
 
142
  demo.queue().launch()
 
2
  import numpy as np
3
  import random
4
  #import spaces #[uncomment to use ZeroGPU]
5
+ from pipeline import TextToImagePipeline
6
  import torch
7
 
8
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
9
 
10
  if torch.cuda.is_available():
11
  torch_dtype = torch.float16
12
  else:
13
  torch_dtype = torch.float32
14
 
15
+ pipe = TextToImagePipeline(device=device)
 
16
 
17
  MAX_SEED = np.iinfo(np.int32).max
18
  MAX_IMAGE_SIZE = 1024
19
 
20
  #@spaces.GPU #[uncomment to use ZeroGPU]
21
def infer(prompt, num_inference_steps, amt, progress=gr.Progress(track_tqdm=True)):
    """Generate an image grid for `prompt` and return it for display.

    Args:
        prompt: Text prompt to condition the diffusion model on.
        num_inference_steps: Number of denoising steps for the sampler.
        amt: Number of images to sample into the grid.
        progress: Gradio progress tracker (mirrors tqdm output).

    Returns:
        Path of the saved grid image (gradio's Image output accepts a filepath).
    """
    # BUG FIX: TextToImagePipeline.__call__ returns the saved file path (see
    # pipeline.py), not a diffusers-style result object — the original
    # `pipe(...).images[0]` raised AttributeError. Return the path directly.
    image_path = pipe(prompt, num_inference_steps, amt)
    return image_path
29
 
30
  examples = [
31
  "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
 
63
 
64
  with gr.Accordion("Advanced Settings", open=False):
65
 
66
+ amt = gr.Slider(
67
+ label="Amount",
68
+ minimum=1,
69
+ maximum=8,
70
+ step=1,
71
+ value=8,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  )
73
 
74
+ steps = gr.Slider(
75
+ label="Num inference steps",
76
+ minimum=100,
77
+ maximum=2000,
 
 
 
 
 
 
 
 
 
 
78
  step=1,
79
+ value=1000,
80
  )
81
+
82
 
83
  gr.Examples(
84
  examples = examples,
 
87
  gr.on(
88
  triggers=[run_button.click, prompt.submit],
89
  fn = infer,
90
+ inputs = [prompt, steps,amt],
91
+ outputs = [result]
92
  )
93
 
94
  demo.queue().launch()
bert_vectorize.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from transformers import BertTokenizer, BertModel, DistilBertTokenizer, DistilBertModel
import torch

# Tokenizer and model are loaded once at import time and shared by all callers.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased', output_hidden_states=True)
model.eval()

device = "mps" if torch.backends.mps.is_available() else "cpu"

model = model.to(device)


def vectorize_text_with_bert(text):  # from hf docs
    """Return a sentence embedding: mean of DistilBERT's last hidden layer.

    Args:
        text: Input string (or list of strings; padding/truncation enabled).

    Returns:
        A tensor of shape (768,) for a single input (batch dim squeezed).
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    last_layer_hidden_states = outputs.hidden_states[-1]
    # Mean-pool over the token axis; squeeze drops the batch dim for one input.
    return torch.mean(last_layer_hidden_states, dim=1).squeeze(0)


def cleanup():
    """Release the module-level model/tokenizer references.

    BUG FIX: runner.py imports this name (`from bert_vectorize import
    vectorize_text_with_bert, cleanup`) but it did not exist, which made that
    import fail. Presumably it exists to free memory once inference jobs
    finish — TODO confirm intended semantics.
    """
    global model, tokenizer
    model = None
    tokenizer = None


if __name__ == "__main__":
    text = "A man walking down the street with a dog holding a balloon in one hand."
    text_representation = vectorize_text_with_bert(text)

    print("Vectorized representation:", text_representation)
    print(text_representation.shape)
factories.py ADDED
@@ -0,0 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+
7
class EMA:
    """Exponential moving average of model weights.

    Maintains a shadow copy of a model's parameters updated as
    ema = beta * ema + (1 - beta) * current, with a warm-up period during
    which the shadow simply mirrors the raw weights.
    """

    def __init__(self, beta):
        super().__init__()
        self.beta = beta
        self.step = 0

    def update_model_average(self, ma_model, current_model):
        # Walk both parameter lists in lockstep and blend each pair in place.
        for src_params, shadow_params in zip(current_model.parameters(), ma_model.parameters()):
            shadow_params.data = self.update_average(shadow_params.data, src_params.data)

    def update_average(self, old, new):
        # First update: nothing to blend with yet, so take the new value as-is.
        return new if old is None else old * self.beta + (1 - self.beta) * new

    def step_ema(self, ema_model, model, step_start_ema=2000):
        # Warm-up: keep the EMA identical to the live model until
        # step_start_ema iterations have elapsed, then start averaging.
        if self.step < step_start_ema:
            self.reset_parameters(ema_model, model)
        else:
            self.update_model_average(ema_model, model)
        self.step += 1

    def reset_parameters(self, ema_model, model):
        # Hard-copy the live weights into the EMA model.
        ema_model.load_state_dict(model.state_dict())
33
+
34
+
35
class SelfAttention(nn.Module):
    """Multi-head self-attention over the spatial positions of a feature map.

    Expects inputs of shape (B, channels, size, size); `channels` must be
    divisible by the 4 attention heads.
    """

    def __init__(self, channels, size):
        super(SelfAttention, self).__init__()
        self.channels = channels
        self.size = size
        self.mha = nn.MultiheadAttention(channels, 4, batch_first=True)
        self.ln = nn.LayerNorm([channels])
        self.ff_self = nn.Sequential(
            nn.LayerNorm([channels]),
            nn.Linear(channels, channels),
            nn.GELU(),
            nn.Linear(channels, channels),
        )

    def forward(self, x):
        # (B, C, H, W) -> (B, H*W, C): one token per spatial position.
        tokens = x.view(-1, self.channels, self.size * self.size).swapaxes(1, 2)
        normed = self.ln(tokens)
        attended, _ = self.mha(normed, normed, normed)
        attended = attended + tokens              # residual around attention
        out = self.ff_self(attended) + attended   # residual around the MLP
        # Restore the (B, C, H, W) layout.
        return out.swapaxes(2, 1).view(-1, self.channels, self.size, self.size)
56
+
57
+
58
class CrossAttention(nn.Module):
    """Cross-attention from spatial tokens (queries) to a conditioning vector.

    The conditioning vector of width `context_dim` is projected to `channels`
    and broadcast across every spatial position as keys/values.
    """

    def __init__(self, channels, size, context_dim):
        super(CrossAttention, self).__init__()
        self.channels = channels
        self.size = size
        self.context_dim = context_dim
        self.mha = nn.MultiheadAttention(channels, 4, batch_first=True)
        self.ln = nn.LayerNorm(channels)
        self.context_ln = nn.LayerNorm(channels)
        self.ff_self = nn.Sequential(
            nn.LayerNorm(channels),
            nn.Linear(channels, channels),
            nn.GELU(),
            nn.Linear(channels, channels),
        )
        # Project the conditioning vector into the attention width.
        self.context_proj = nn.Linear(context_dim, channels)

    def forward(self, x, context):
        batch_size, channels, height, width = x.size()
        # Flatten spatial dims into a token axis: (B, H*W, C).
        tokens = x.view(-1, self.channels, self.size * self.size).swapaxes(1, 2)
        tokens_ln = self.ln(tokens)

        # Project the context, then repeat it once per spatial token.
        ctx = self.context_proj(context)
        ctx = ctx.unsqueeze(1).expand(-1, tokens_ln.size(1), -1)
        ctx_ln = self.context_ln(ctx)

        # Queries come from the image; keys/values from the context.
        attended, _ = self.mha(tokens_ln, ctx_ln, ctx_ln)
        attended = attended + tokens
        attended = self.ff_self(attended) + attended

        # Back to the (B, C, H, W) layout.
        return attended.permute(0, 2, 1).view(batch_size, channels, height, width)
102
+
103
+
104
class DoubleConv(nn.Module):
    """Two 3x3 conv + GroupNorm layers, optionally wrapped in a GELU residual.

    Residual mode (residual=True) requires in_channels == out_channels.
    """

    def __init__(self, in_channels, out_channels, mid_channels=None, residual=False):
        super().__init__()
        self.residual = residual
        # Default the hidden width to the output width.
        mid = mid_channels if mid_channels else out_channels
        self.double_conv = nn.Sequential(
            nn.Conv2d(in_channels, mid, kernel_size=3, padding=1, bias=False),
            nn.GroupNorm(1, mid),
            nn.GELU(),
            nn.Conv2d(mid, out_channels, kernel_size=3, padding=1, bias=False),
            nn.GroupNorm(1, out_channels),
        )

    def forward(self, x):
        out = self.double_conv(x)
        return F.gelu(x + out) if self.residual else out
123
+
124
+
125
class Down(nn.Module):
    """Downsample by 2 (max-pool + double convs), then add a timestep embedding."""

    def __init__(self, in_channels, out_channels, emb_dim=256):
        super().__init__()
        self.maxpool_conv = nn.Sequential(
            nn.MaxPool2d(2),
            DoubleConv(in_channels, in_channels, residual=True),
            DoubleConv(in_channels, out_channels),
        )
        # Maps the timestep embedding to one bias per output channel.
        self.emb_layer = nn.Sequential(
            nn.SiLU(),
            nn.Linear(emb_dim, out_channels),
        )

    def forward(self, x, t):
        x = self.maxpool_conv(x)
        # Tile the per-channel embedding over the spatial grid before adding.
        emb = self.emb_layer(t)[:, :, None, None].repeat(1, 1, x.shape[-2], x.shape[-1])
        return x + emb
146
+
147
+
148
class Up(nn.Module):
    """Upsample by 2, merge the encoder skip connection, add a timestep embedding."""

    def __init__(self, in_channels, out_channels, emb_dim=256):
        super().__init__()
        self.up = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True)
        self.conv = nn.Sequential(
            DoubleConv(in_channels, in_channels, residual=True),
            DoubleConv(in_channels, out_channels, in_channels // 2),
        )
        # Maps the timestep embedding to one bias per output channel.
        self.emb_layer = nn.Sequential(
            nn.SiLU(),
            nn.Linear(emb_dim, out_channels),
        )

    def forward(self, x, skip_x, t):
        upsampled = self.up(x)
        # Channel-concat encoder features with the upsampled decoder features.
        merged = torch.cat([skip_x, upsampled], dim=1)
        merged = self.conv(merged)
        emb = self.emb_layer(t)[:, :, None, None].repeat(1, 1, merged.shape[-2], merged.shape[-1])
        return merged + emb
172
+
173
+
174
class Dome_UNet(nn.Module):
    """Unconditional diffusion U-Net with self-attention.

    The fixed SelfAttention sizes (32/16/8/64) tie this network to 64x64
    square inputs. `device` is used only for the positional-encoding tensor.
    """

    def __init__(self, c_in=3, c_out=3, time_dim=256, device="mps"):
        super().__init__()
        self.device = device
        self.time_dim = time_dim
        # Encoder: 64 -> 32 -> 16 -> 8 spatial resolution.
        self.inc = DoubleConv(c_in, 64)
        self.down1 = Down(64, 128)
        self.sa1 = SelfAttention(128, 32)
        self.down2 = Down(128, 256)
        self.sa2 = SelfAttention(256, 16)
        self.down3 = Down(256, 256)
        self.sa3 = SelfAttention(256, 8)
        # Bottleneck.
        self.bot1 = DoubleConv(256, 512)
        self.bot2 = DoubleConv(512, 512)
        self.bot3 = DoubleConv(512, 256)
        # Decoder with skip connections back up to 64x64.
        self.up1 = Up(512, 128)
        self.sa4 = SelfAttention(128, 16)
        self.up2 = Up(256, 64)
        self.sa5 = SelfAttention(64, 32)
        self.up3 = Up(128, 64)
        self.sa6 = SelfAttention(64, 64)
        self.outc = nn.Conv2d(64, c_out, kernel_size=1)

    def pos_encoding(self, t, channels):
        """Sinusoidal timestep embedding of width `channels` for timesteps `t` (shape (B, 1))."""
        inv_freq = 1.0 / (
            10000
            ** (torch.arange(0, channels, 2, device=self.device).float() / channels)
        )
        angles = t.repeat(1, channels // 2) * inv_freq
        # First half sin, second half cos.
        return torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1)

    def forward(self, x, t):
        t = t.unsqueeze(-1).type(torch.float)
        t = self.pos_encoding(t, self.time_dim)

        # Encoder (keep skips for the decoder).
        skip1 = self.inc(x)
        skip2 = self.sa1(self.down1(skip1, t))
        skip3 = self.sa2(self.down2(skip2, t))
        h = self.sa3(self.down3(skip3, t))

        # Bottleneck.
        h = self.bot3(self.bot2(self.bot1(h)))

        # Decoder.
        h = self.sa4(self.up1(h, skip3, t))
        h = self.sa5(self.up2(h, skip2, t))
        h = self.sa6(self.up3(h, skip1, t))
        return self.outc(h)
233
+
234
+
235
class UNet_conditional(nn.Module):
    """Conditional diffusion U-Net: conditioning injected via cross-attention.

    `num_classes` is the width of the conditioning vector (e.g. 768 for
    DistilBERT sentence embeddings); `context_dim` defaults to it. The fixed
    attention sizes (32/16/8/64) tie this network to 64x64 inputs. `device`
    is used only for the positional-encoding tensor.
    """

    def __init__(self, c_in=3, c_out=3, time_dim=256, num_classes=None, context_dim=None, device="mps"):
        super().__init__()

        if context_dim is None:
            context_dim = num_classes
        self.device = device
        self.time_dim = time_dim

        # Encoder: 64 -> 32 -> 16 -> 8 spatial resolution.
        self.inc = DoubleConv(c_in, 64)
        self.down1 = Down(64, 128)
        self.sa1 = SelfAttention(128, 32)
        self.xa1 = CrossAttention(128, 32, context_dim)
        self.down2 = Down(128, 256)
        self.xa2 = CrossAttention(256, 16, context_dim)
        self.sa2 = SelfAttention(256, 16)
        self.down3 = Down(256, 256)
        self.xa3 = CrossAttention(256, 8, context_dim)
        self.sa3 = SelfAttention(256, 8)

        # Bottleneck.
        self.bot1 = DoubleConv(256, 512)
        self.bot2 = DoubleConv(512, 512)
        self.bot3 = DoubleConv(512, 256)

        # Decoder. NOTE: the sa* modules are unused in forward() (the calls are
        # disabled below) but are kept so existing checkpoints still load.
        self.up1 = Up(512, 128)
        self.xa4 = CrossAttention(128, 16, context_dim)
        self.sa4 = SelfAttention(128, 16)
        self.up2 = Up(256, 64)
        self.xa5 = CrossAttention(64, 32, context_dim)
        self.sa5 = SelfAttention(64, 32)
        self.up3 = Up(128, 64)
        self.xa6 = CrossAttention(64, 64, context_dim)
        self.sa6 = SelfAttention(64, 64)
        self.outc = nn.Conv2d(64, c_out, kernel_size=1)

        if num_classes is not None:
            self.label_emb = nn.Linear(num_classes, time_dim)  # Embedding(num_classes, time_dim)
            self.num_classes = num_classes
            # BUG FIX: the original re-checked `context_dim is None` here; that
            # branch was dead because context_dim was already defaulted above.
            self.context_dim = context_dim
            self.label_crossattn_emb = nn.Linear(num_classes, context_dim)

    def pos_encoding(self, t, channels):
        """Sinusoidal timestep embedding of width `channels` for timesteps `t` (shape (B, 1))."""
        inv_freq = 1.0 / (
            10000
            ** (torch.arange(0, channels, 2, device=self.device).float() / channels)
        )
        pos_enc_a = torch.sin(t.repeat(1, channels // 2) * inv_freq)
        pos_enc_b = torch.cos(t.repeat(1, channels // 2) * inv_freq)
        pos_enc = torch.cat([pos_enc_a, pos_enc_b], dim=-1)
        return pos_enc

    def forward(self, x, t, y):
        """Predict noise for `x` at timesteps `t`, conditioned on vectors `y`.

        Raises:
            ValueError: if `y` is None — every stage cross-attends to it.
        """
        t = t.unsqueeze(-1).type(torch.float)
        t = self.pos_encoding(t, self.time_dim)

        # BUG FIX: the original guarded `if y is not None:` but then used
        # `attn_y` unconditionally, so y=None fell through to a NameError.
        # Fail with a clear message instead.
        if y is None:
            raise ValueError("UNet_conditional.forward requires a conditioning tensor y")

        attn_y = self.label_crossattn_emb(y[:, :self.num_classes])

        x1 = self.inc(x)

        x2 = self.down1(x1, t)
        x2 = self.xa1(x2, attn_y)
        # x2 = self.sa1(x2)

        x3 = self.down2(x2, t)
        x3 = self.xa2(x3, attn_y)
        # x3 = self.sa2(x3)

        x4 = self.down3(x3, t)
        x4 = self.xa3(x4, attn_y)
        # x4 = self.sa3(x4)

        x4 = self.bot1(x4)
        x4 = self.bot2(x4)
        x4 = self.bot3(x4)

        x = self.up1(x4, x3, t)
        x = self.xa4(x, attn_y)
        # x = self.sa4(x)

        x = self.up2(x, x2, t)
        x = self.xa5(x, attn_y)
        # x = self.sa5(x)

        x = self.up3(x, x1, t)
        x = self.xa6(x, attn_y)
        # x = self.sa6(x)
        output = self.outc(x)

        return output
infer.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from factories import UNet_conditional
from wrapper import DiffusionManager, Schedule
import os
import re
import torch
from bert_vectorize import vectorize_text_with_bert
import time
import torchvision
from logger import save_grid_with_label


EXPERIMENT_DIRECTORY = "runs/run_3_jxa"
device = "mps" if torch.backends.mps.is_available() else "cpu"

# BUG FIX: the original wrapped os.mkdir in a bare `except:` which would also
# swallow unrelated errors (permissions, bad path). exist_ok covers the
# "already exists" case explicitly.
os.makedirs(os.path.join(EXPERIMENT_DIRECTORY, "inferred"), exist_ok=True)

# 768 matches the DistilBERT embedding width used as conditioning.
net = UNet_conditional(num_classes=768)
net.to(device)
net.load_state_dict(torch.load(os.path.join(EXPERIMENT_DIRECTORY, "ckpt/latest.pt"), weights_only=True))

wrapper = DiffusionManager(net, device=device, noise_steps=1000)
wrapper.set_schedule(Schedule.LINEAR)


def generate_sample_save_images(prompt, amt=1):
    """Sample `amt` 64x64 images for `prompt` and save them as one labeled grid."""
    # Slug the prompt (letters/spaces only) and append a timestamp for uniqueness.
    path = os.path.join(
        EXPERIMENT_DIRECTORY,
        "inferred",
        re.sub(r'[^a-zA-Z\s]', '', prompt).replace(" ", "_") + str(int(time.time())) + ".png",
    )

    vprompt = vectorize_text_with_bert(prompt).unsqueeze(0)

    generated = wrapper.sample(64, vprompt, amt=amt).detach().cpu()

    save_grid_with_label(torchvision.utils.make_grid(generated), prompt, path)


if __name__ == "__main__":
    generate_sample_save_images(input("Prompt? "), 8)
logger.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt

# Lazily-created module-level TensorBoard writer; call init_logger() before
# log_data/log_img, otherwise they raise AttributeError on None.
writer = None


def log_data(data, i):
    """Write each scalar in `data` (name -> value) to TensorBoard at step `i`."""
    # Iterate items() instead of keys() + lookup (PERF102).
    for key, value in data.items():
        writer.add_scalar(key, value, i)


def log_img(img, name):
    """Write a single image tensor to TensorBoard under `name`."""
    writer.add_image(name, img)


def save_grid_with_label(img_grid, label, out_file):
    """Render a CHW image grid with `label` as the title and save it to `out_file`."""
    img_grid = img_grid.permute(1, 2, 0).numpy()  # CHW -> HWC for matplotlib

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(img_grid)
    ax.set_title(label, fontsize=20)
    ax.axis('off')

    plt.subplots_adjust(top=0.85)  # leave headroom for the title

    plt.savefig(out_file, bbox_inches='tight', pad_inches=0.1)

    # Close explicitly so repeated calls don't leak figures.
    plt.close(fig)
    plt.close("all")


def init_logger(dir="runs"):
    """Create the global SummaryWriter once; later calls are no-ops."""
    global writer
    if writer is None:  # explicit None check rather than truthiness
        writer = SummaryWriter(dir)
pipeline.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # pipeline.py
2
+ import os
3
+ import re
4
+ import time
5
+ import torch
6
+ import torchvision
7
+ from huggingface_hub import HfApi, HfFolder
8
+ from transformers import Pipeline
9
+ from factories import UNet_conditional
10
+ from wrapper import DiffusionManager, Schedule
11
+ from bert_vectorize import vectorize_text_with_bert
12
+ from logger import save_grid_with_label
13
+
14
class TextToImagePipeline(Pipeline):
    """Text-to-image diffusion pipeline around UNet_conditional + DiffusionManager.

    NOTE(review): this subclasses transformers.Pipeline but replaces __init__
    and __call__ wholesale (no super().__init__()), so none of the transformers
    pipeline machinery is actually used — confirm the base class is needed.
    """

    def __init__(self, model_dir: str = "runs/run_3_jxa", device: str = "cpu"):
        # Initialize model, diffusion manager, and set up environment.
        self.device = device
        self.model_dir = model_dir

        # Create directories if they do not exist.
        os.makedirs(os.path.join(model_dir, "inferred"), exist_ok=True)

        # 768 matches the DistilBERT embedding width used as conditioning.
        self.net = UNet_conditional(num_classes=768)
        self.net.to(self.device)
        self.net.load_state_dict(torch.load(os.path.join(model_dir, "ckpt/latest.pt"), weights_only=True))

        # Set up DiffusionManager.
        self.wrapper = DiffusionManager(self.net, device=self.device, noise_steps=1000)
        self.wrapper.set_schedule(Schedule.LINEAR)

    def __call__(self, prompt, num_steps=1000, amt=8):
        """Generate `amt` images for `prompt`; returns the saved grid's path.

        BUG FIX: num_steps/amt were required positional arguments, so this
        file's own `__main__` call `pipeline(prompt, amt=8)` raised TypeError.
        Defaults keep the signature backward-compatible for all callers.
        """
        # Rebuild the manager so a caller-chosen step count takes effect.
        self.wrapper = DiffusionManager(self.net, device=self.device, noise_steps=num_steps)
        self.wrapper.set_schedule(Schedule.LINEAR)

        return self.generate_sample_save_images(prompt, amt)

    def generate_sample_save_images(self, prompt: str, amt: int = 1):
        """Sample, save a labeled grid, and return the output file path."""
        # Slug the prompt (letters/spaces only) plus a timestamp for uniqueness.
        output_path = os.path.join(self.model_dir, "inferred",
                                   re.sub(r'[^a-zA-Z\s]', '', prompt).replace(" ", "_") + str(int(time.time())) + ".png")

        # Vectorize the prompt.
        vprompt = vectorize_text_with_bert(prompt).unsqueeze(0)

        # Generate images.
        generated = self.wrapper.sample(64, vprompt, amt=amt).detach().cpu()

        # Save images using the provided save function.
        save_grid_with_label(torchvision.utils.make_grid(generated), prompt, output_path)

        return output_path  # Return the path to the saved image
56
+
57
+
58
# Usage example
if __name__ == "__main__":
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    model_dir = "runs/run_3_jxa"  # Path to your model directory

    # Create an instance of the pipeline.
    pipeline = TextToImagePipeline(model_dir=model_dir, device=device)

    # Get user input and generate an image.
    prompt = input("Prompt? ")
    # BUG FIX: __call__ takes (prompt, num_steps, amt) positionally; the
    # original `pipeline(prompt, amt=8)` omitted the required num_steps
    # argument and raised TypeError. Pass it explicitly.
    image_path = pipeline(prompt, 1000, 8)
    print(f"Generated image saved at: {image_path}")
predict.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Prediction interface for Cog ⚙️
2
+ # https://cog.run/python
3
+
4
+ from cog import BasePredictor, Input, Path
5
+ import os
6
+ from factories import UNet_conditional
7
+ from wrapper import DiffusionManager, Schedule
8
+ import torch
9
+ import re
10
+ from bert_vectorize import vectorize_text_with_bert
11
+ from logger import save_grid_with_label
12
+ import torchvision
13
+ import time
14
+
15
+
16
class Predictor(BasePredictor):
    """Cog predictor wrapping the conditional diffusion model (CPU-only)."""

    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""
        device = "cpu"
        model_dir = "runs/run_3_jxa"
        self.device = device
        self.model_dir = model_dir

        # Create directories if they do not exist.
        os.makedirs(os.path.join(model_dir, "inferred"), exist_ok=True)

        # Load model. NOTE(review): weights_only=False deserializes arbitrary
        # pickled objects — only safe because this checkpoint ships with the repo.
        self.net = UNet_conditional(num_classes=768, device=device)
        self.net.to(self.device)
        self.net.load_state_dict(torch.load(os.path.join(model_dir, "ckpt/latest_cpu.pt"), weights_only=False))

        # Set up DiffusionManager.
        self.wrapper = DiffusionManager(self.net, device=self.device, noise_steps=1000)
        self.wrapper.set_schedule(Schedule.LINEAR)

    def predict(
        self,
        prompt: str = Input(description="Text prompt"),
        amt: int = Input(description="Amt", default=8)
    ) -> Path:
        """Run a single prediction on the model"""
        # Vectorize the prompt.
        vprompt = vectorize_text_with_bert(prompt).unsqueeze(0)

        generated = self.wrapper.sample(64, vprompt, amt=amt).detach().cpu()

        # BUG FIX: the declared return type is Path, but the original returned
        # a raw numpy array, which Cog cannot serve as a file. Save the grid
        # and return its path instead.
        out_path = os.path.join(
            self.model_dir, "inferred",
            re.sub(r'[^a-zA-Z\s]', '', prompt).replace(" ", "_") + str(int(time.time())) + ".png")
        save_grid_with_label(torchvision.utils.make_grid(generated), prompt, out_path)
        return Path(out_path)
runner.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from factories import UNet_conditional
2
+ from wrapper import DiffusionManager, Schedule
3
+ import os
4
+ import re
5
+ import torch
6
+ from bert_vectorize import vectorize_text_with_bert, cleanup
7
+ import time
8
+ import torchvision
9
+ from logger import save_grid_with_label
10
+ from clip_score import select_top_n_images
11
+ from torchinfo import summary
12
+
13
+
14
+
15
+ EXPERIMENT_DIRECTORY = "runs/run_3_jxa_resumed"
16
+ device = "mps" if torch.backends.mps.is_available() else "cpu"
17
+
18
+ try:
19
+ os.mkdir(os.path.join(EXPERIMENT_DIRECTORY, "inferred"))
20
+ except:
21
+ print("Skipping making directory, directory already exists")
22
+
23
+ net = UNet_conditional(num_classes=768)
24
+ net.to(device)
25
+ net.load_state_dict(torch.load(os.path.join(EXPERIMENT_DIRECTORY, "ckpt/latest.pt")))
26
+
27
+
28
def count_parameters(model):
    """Return the number of trainable parameters in `model`."""
    # sum() over a generator replaces the original tensor-building detour
    # (torch.tensor([...]).sum().item()) — same result, no allocation.
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
30
+ print(f"Parameters: {count_parameters(net)}")
31
+
32
+
33
+
34
+ wrapper = DiffusionManager(net, device=device, noise_steps=1000)
35
+ wrapper.set_schedule(Schedule.LINEAR)
36
+
37
+
38
def infer(prompt, amt=1, topn=8):
    """Sample `amt` images for `prompt`, keep the `topn` best by CLIP score, save a grid."""
    # Slug the prompt (letters/spaces only) plus a timestamp for uniqueness.
    slug = re.sub(r'[^a-zA-Z\s]', '', prompt).replace(" ", "_")
    path = os.path.join(EXPERIMENT_DIRECTORY, "inferred", slug + str(int(time.time())) + ".png")

    vprompt = vectorize_text_with_bert(prompt).unsqueeze(0)

    generated = wrapper.sample(64, vprompt, amt=amt).detach().cpu()

    # Rank by CLIP similarity to the prompt and keep the best topn.
    generated, _ = select_top_n_images(generated, prompt, n=topn)

    save_grid_with_label(torchvision.utils.make_grid(generated), prompt + f"({topn} best of {amt})", path)
49
+
50
+
51
def run_jobs():
    """Process inference_jobs.txt line by line, re-reading it until no new lines appear."""
    n = 8
    bestof = 32
    print(f"using best {bestof} of {n}")
    processed_tasks = set()

    def read_jobs():
        # A missing jobs file simply means there is nothing to do yet.
        try:
            with open("inference_jobs.txt", 'r') as file:
                return [line.strip() for line in file.readlines()]
        except FileNotFoundError:
            return []

    new_tasks = [task for task in read_jobs() if task not in processed_tasks]
    # Loop until a re-read of the file yields no unseen prompts.
    # (The original's inner `if new_tasks:` was redundant inside this loop.)
    while new_tasks:
        for task in new_tasks:
            infer(task, n, bestof)
            processed_tasks.add(task)
        new_tasks = [task for task in read_jobs() if task not in processed_tasks]

    cleanup()
77
+
78
+ if __name__ == "__main__":
79
+ #infer(input("Prompt? "), 8)
80
+ run_jobs()
runs/run_3_jxa/ckpt/latest.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0cd39e8429ea0ace24bb40d4bd404baebb8aae471385987b898a966eb79dcc5f
3
+ size 103503678
runs/run_3_jxa/ckpt/latest_cpu.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e6d31021fe6d0df8d0d8dee730a411648345f13c0d5ae10084efe536d5dc7a2
3
+ size 103505112
wrapper.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from enum import Enum
4
+ from tqdm import trange
5
+
6
+
7
+
8
+
9
+
10
+ Schedule = Enum('Schedule', ['LINEAR', 'COSINE'])
11
+
12
+ class DiffusionManager(nn.Module):
13
+ def __init__(self, model: nn.Module, noise_steps=1000, start=0.0001, end=0.02, device="cpu", **kwargs ) -> None:
14
+ super().__init__(**kwargs)
15
+
16
+ self.model = model
17
+
18
+ self.noise_steps = noise_steps
19
+
20
+ self.start = start
21
+ self.end = end
22
+ self.device = device
23
+
24
+ self.schedule = None
25
+
26
+ self.set_schedule()
27
+
28
+ #model.set_parent(self)
29
+
30
+
31
+ def _get_schedule(self, schedule_type: Schedule = Schedule.LINEAR):
32
+ if schedule_type == Schedule.LINEAR:
33
+ return torch.linspace(self.start, self.end, self.noise_steps)
34
+ elif schedule_type == Schedule.COSINE:
35
+ # https://arxiv.org/pdf/2102.09672 page 4
36
+ #https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
37
+ #line 18
38
+ def get_alphahat_at(t):
39
+ def f(t):
40
+ s=self.start
41
+ return torch.cos((t/self.noise_steps + s)/(1+s) * torch.pi/2) ** 2
42
+
43
+ return f(t)/f(torch.zeros_like(t))
44
+
45
+ t = torch.Tensor(range(self.noise_steps))
46
+
47
+ t = 1-(get_alphahat_at(t + 1)/get_alphahat_at(t))
48
+
49
+ t = torch.minimum(t, torch.ones_like(t) * 0.999) #"In practice, we clip β_t to be no larger than 0.999 to prevent singularities at the end of the diffusion process n"
50
+
51
+ return t
52
+
53
+ def set_schedule(self, schedule: Schedule = Schedule.LINEAR):
54
+ self.schedule = self._get_schedule(schedule).to(self.device)
55
+
56
+ def get_schedule_at(self, step):
57
+ beta = self.schedule
58
+ alpha = 1 - beta
59
+ alpha_hat = torch.cumprod(alpha, dim=0)
60
+
61
+ return self._unsqueezify(beta.data[step]), self._unsqueezify(alpha.data[step]), self._unsqueezify(alpha_hat.data[step])
62
+
63
+ @staticmethod
64
+ def _unsqueezify(value):
65
+ return value.view(-1, 1, 1, 1)#.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
66
+
67
+ def noise_image(self, image, step):
68
+
69
+
70
+ image = image.to(self.device)
71
+
72
+ beta, alpha, alpha_hat = self.get_schedule_at(step)
73
+
74
+ epsilon = torch.randn_like(image)
75
+
76
+ # print(alpha_hat)
77
+
78
+ # print(alpha_hat.size())
79
+ # print(image.size())
80
+
81
+ noised_img = torch.sqrt(alpha_hat) * image + torch.sqrt(1 - alpha_hat) * epsilon
82
+
83
+ return noised_img, epsilon
84
+
85
+ def random_timesteps(self, amt=1):
86
+
87
+ return torch.randint(low=1, high=self.noise_steps, size=(amt,))
88
+
89
+
90
+
91
+
92
+ def sample(self, img_size, condition, amt=5, use_tqdm=True):
93
+
94
+ if tuple(condition.shape)[0] < amt:
95
+ condition = condition.repeat(amt, 1)
96
+
97
+ self.model.eval()
98
+
99
+ condition = condition.to(self.device)
100
+
101
+ my_trange = lambda x, y, z: trange(x,y, z, leave=False,dynamic_ncols=True)
102
+ fn = my_trange if use_tqdm else range
103
+ with torch.no_grad():
104
+
105
+ cur_img = torch.randn((amt, 3, img_size, img_size)).to(self.device)
106
+ for i in fn(self.noise_steps-1, 0, -1):
107
+
108
+ timestep = torch.ones(amt) * (i)
109
+
110
+ timestep = timestep.to(self.device)
111
+
112
+
113
+
114
+ predicted_noise = self.model(cur_img, timestep, condition)
115
+
116
+ beta, alpha, alpha_hat = self.get_schedule_at(i)
117
+
118
+ cur_img = (1/torch.sqrt(alpha))*(cur_img - (beta/torch.sqrt(1-alpha_hat))*predicted_noise)
119
+ if i > 1:
120
+ cur_img = cur_img + torch.sqrt(beta)*torch.randn_like(cur_img)
121
+
122
+
123
+ self.model.train()
124
+
125
+
126
+
127
+
128
+
129
+ return cur_img
130
+ def sample_multicond(self, img_size, condition, use_tqdm=True):
131
+ num_conditions = condition.shape[0]
132
+
133
+
134
+
135
+ amt = num_conditions
136
+
137
+ self.model.eval()
138
+
139
+ condition = condition.to(self.device)
140
+
141
+ my_trange = lambda x, y, z: trange(x, y, z, leave=False, dynamic_ncols=True)
142
+ fn = my_trange if use_tqdm else range
143
+
144
+ with torch.no_grad():
145
+
146
+ cur_img = torch.randn((amt, 3, img_size, img_size)).to(self.device)
147
+
148
+ for i in fn(self.noise_steps-1, 0, -1):
149
+ timestep = torch.ones(amt) * i
150
+ timestep = timestep.to(self.device)
151
+
152
+
153
+ predicted_noise = self.model(cur_img, timestep, condition)
154
+
155
+ beta, alpha, alpha_hat = self.get_schedule_at(i)
156
+
157
+ cur_img = (1 / torch.sqrt(alpha)) * (cur_img - (beta / torch.sqrt(1 - alpha_hat)) * predicted_noise)
158
+ if i > 1:
159
+ cur_img = cur_img + torch.sqrt(beta) * torch.randn_like(cur_img)
160
+
161
+ self.model.train()
162
+
163
+ # Return images sampled for each condition
164
+ return cur_img
165
+
166
+ def training_loop_iteration(self, optimizer, batch, label, criterion):
167
+
168
+ def print_(string):
169
+ for i in range(10):
170
+ print(string)
171
+ batch = batch.to(self.device)
172
+
173
+ #label = label.long() # uncomment for nn.Embedding
174
+ label = label.to(self.device)
175
+
176
+ timesteps = self.random_timesteps(batch.shape[0]).to(self.device)
177
+
178
+ noisy_batch, real_noise = self.noise_image(batch, timesteps)
179
+
180
+ if torch.isnan(noisy_batch).any() or torch.isnan(real_noise).any():
181
+ print_("NaNs detected in the noisy batch or real noise")
182
+
183
+
184
+ pred_noise = self.model(noisy_batch, timesteps, label)
185
+
186
+ if torch.isnan(pred_noise).any():
187
+ print_("NaNs detected in the predicted noise")
188
+
189
+ loss = criterion(real_noise, pred_noise)
190
+
191
+ if torch.isnan(loss).any():
192
+ print_("NaNs detected in the loss")
193
+
194
+ loss.backward()
195
+ optimizer.step()
196
+
197
+ return loss.item()
198
+