Spaces:

multimodalart
/

diffusion

Runtime error

App Files Files Community

multimodalart HF Staff committed on May 11, 2022

Commit

3a72088

1 Parent(s): 1473645

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -13

app.py CHANGED Viewed

@@ -29,11 +29,11 @@ model.load_state_dict(torch.load(cc12m_model, map_location='cpu'))
 model = model.half().cuda().eval().requires_grad_(False)
 clip_model = clip.load(model.clip_model, jit=False, device='cpu')[0]
-def run_all(prompt, steps, n_images, weight):
     import random
     seed = int(random.randint(0, 2147483647))
     target_embed = clip_model.encode_text(clip.tokenize(prompt)).float().cuda()
     def cfg_model_fn(x, t):
         """The CFG wrapper function."""
         n = x.shape[0]
@@ -44,14 +44,41 @@ def run_all(prompt, steps, n_images, weight):
         v_uncond, v_cond = model(x_in, t_in, clip_embed_in).chunk(2, dim=0)
         v = v_uncond + (v_cond - v_uncond) * weight
         return v
     gc.collect()
     torch.cuda.empty_cache()
     torch.manual_seed(seed)
     x = torch.randn([n_images, 3, side_y, side_x], device='cuda')
     t = torch.linspace(1, 0, steps + 1, device='cuda')[:-1]
     step_list = utils.get_spliced_ddpm_cosine_schedule(t)
-    outs = sampling.plms_sample(cfg_model_fn, x, step_list, {})#, callback=display_callback)
     images_out = []
     for i, out in enumerate(outs):
         images_out.append(utils.to_pil_image(out))
@@ -65,15 +92,10 @@ iface = gr.Interface(
     fn=run_all,
     inputs=[
     gr.inputs.Textbox(label="Prompt - try adding increments to your prompt such as 'oil on canvas', 'a painting', 'a book cover'",default="chalk pastel drawing of a dog wearing a funny hat"),
-    gr.inputs.Slider(label="Steps - more steps can increase quality but will take longer to generate",default=50,maximum=250,minimum=1,step=1),
-    gr.inputs.Slider(label="Number of images in parallel", default=2, maximum=4, minimum=1,step=1),
-    gr.inputs.Slider(label="Weight", default=5, maximum=15, minimum=0, step=1),
-    #gr.inputs.Checkbox(label="CLIP Guided"),
-    #gr.inputs.Dropdown(label="Flavor",choices=["ginger", "cumin", "holywater", "zynth", "wyvern", "aaron", "moth", "juu", "custom"]),
-    #markdown,
-    #gr.inputs.Dropdown(label="Style",choices=["Default","Balanced","Detailed","Consistent Creativity","Realistic","Smooth","Subtle MSE","Hyper Fast Results"],default="Hyper Fast Results"),
-    #gr.inputs.Radio(label="Width", choices=[32,64,128,256,512],default=512),
-    #gr.inputs.Radio(label="Height", choices=[32,64,128,256,512],default=512),
     ],
     outputs=gallery,
     title="Generate images from text with V-Diffusion CC12M CFG",

 model = model.half().cuda().eval().requires_grad_(False)
 clip_model = clip.load(model.clip_model, jit=False, device='cpu')[0]
+def run_all(prompt, steps, n_images, weight, clip_guided):
     import random
     seed = int(random.randint(0, 2147483647))
     target_embed = clip_model.encode_text(clip.tokenize(prompt)).float().cuda()
+    clip_embed = target_embed.repeat([n, 1])
     def cfg_model_fn(x, t):
         """The CFG wrapper function."""
         n = x.shape[0]
         v_uncond, v_cond = model(x_in, t_in, clip_embed_in).chunk(2, dim=0)
         v = v_uncond + (v_cond - v_uncond) * weight
         return v
+    def make_cond_model_fn(model, cond_fn):
+        def cond_model_fn(x, t, **extra_args):
+            with torch.enable_grad():
+                x = x.detach().requires_grad_()
+                v = model(x, t, **extra_args)
+                alphas, sigmas = utils.t_to_alpha_sigma(t)
+                pred = x * alphas[:, None, None, None] - v * sigmas[:, None, None, None]
+                cond_grad = cond_fn(x, t, pred, **extra_args).detach()
+                v = v.detach() - cond_grad * (sigmas[:, None, None, None] / alphas[:, None, None, None])
+            return v
+        return cond_model_fn
+    def cond_fn(x, t, pred, clip_embed):
+        if min(pred.shape[2:4]) < 256:
+            pred = F.interpolate(pred, scale_factor=2, mode='bilinear', align_corners=False)
+        clip_in = normalize(make_cutouts((pred + 1) / 2))
+        image_embeds = clip_model.encode_image(clip_in).view([args.cutn, x.shape[0], -1])
+        losses = spherical_dist_loss(image_embeds, clip_embed[None])
+        loss = losses.mean(0).sum() * args.clip_guidance_scale
+        grad = -torch.autograd.grad(loss, x)[0]
+        return grad
     gc.collect()
     torch.cuda.empty_cache()
     torch.manual_seed(seed)
     x = torch.randn([n_images, 3, side_y, side_x], device='cuda')
     t = torch.linspace(1, 0, steps + 1, device='cuda')[:-1]
     step_list = utils.get_spliced_ddpm_cosine_schedule(t)
+    if(not clip_guided):
+        outs = sampling.plms_sample(cfg_model_fn, x, step_list, {})#, callback=display_callback)
+    else:
+        extra_args = {'clip_embed': clip_embed}
+        cond_fn_ = cond_fn
+        model_fn = make_cond_model_fn(model, cond_fn_)
+        outs = sampling.plms_sample(model_fn, x, steps, extra_args)
     images_out = []
     for i, out in enumerate(outs):
         images_out.append(utils.to_pil_image(out))
     fn=run_all,
     inputs=[
     gr.inputs.Textbox(label="Prompt - try adding increments to your prompt such as 'oil on canvas', 'a painting', 'a book cover'",default="chalk pastel drawing of a dog wearing a funny hat"),
+    gr.inputs.Slider(label="Steps - more steps can increase quality but will take longer to generate",default=40,maximum=80,minimum=1,step=1),
+    gr.inputs.Slider(label="Number of images in parallel", default=2, maximum=4, minimum=1, step=1),
+    gr.inputs.Slider(label="Weight - how closely the image should resemble the prompt", default=5, maximum=15, minimum=0, step=1),
+    gr.inputs.Checkbox(label="CLIP Guided - improves coherence with prompt, makes it slower"),
     ],
     outputs=gallery,
     title="Generate images from text with V-Diffusion CC12M CFG",