Spaces:

editing-images
/

leditsplusplus

Running on Zero

App Files Files Community

[Admin maintenance] Migrate grant to ZeroGPU

#21

by multimodalart HF Staff - opened May 25

base: refs/heads/main

←

from: refs/pr/21

Discussion Files changed

+60

-9

Files changed (1) hide show

app.py +60 -9

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import gradio as gr
 import torch
 import numpy as np
@@ -26,7 +27,47 @@ pipe.scheduler = DPMSolverMultistepSchedulerInject.from_pretrained(sd_model_id,
 blip_processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base",torch_dtype=torch.float16).to(device)
 ## IMAGE CAPTIONING ##
 def caption_image(input_image):
     inputs = blip_processor(images=input_image, return_tensors="pt").to(device, torch.float16)
     pixel_values = inputs.pixel_values
@@ -51,6 +92,7 @@ def sample(zs, wts, attention_store, text_cross_attention_maps, prompt_tar="", c
     return img.images[0], attention_store, text_cross_attention_maps
 def reconstruct(
     tar_prompt,
     image_caption,
@@ -64,6 +106,10 @@ def reconstruct(
     reconstruction,
     reconstruct_button,
 ):
     if reconstruct_button == "Hide Reconstruction":
         return (
             reconstruction,
@@ -130,6 +176,7 @@ def load_and_invert(
 ## SEGA ##
 def edit(input_image,
             wts, zs, attention_store, text_cross_attention_maps,
             tar_prompt,
@@ -143,15 +190,19 @@ def edit(input_image,
             neg_guidance_1, neg_guidance_2, neg_guidance_3,
             threshold_1, threshold_2, threshold_3,
             do_reconstruction,
-            reconstruction,
             # for inversion in case it needs to be re computed (and avoid delay):
             do_inversion,
-            seed,
             randomize_seed,
             src_prompt,
             src_cfg_scale,
             mask_type,
             progress=gr.Progress(track_tqdm=True)):
     show_share_button = gr.update(visible=True)
     if(mask_type == "No mask"):
         use_cross_attn_mask = False
@@ -207,18 +258,18 @@ def edit(input_image,
                           # wts=wts.value,
                           zs=zs, attention_store=attention_store, text_cross_attention_maps=text_cross_attention_maps, **editing_args)
-      return sega_out.images[0], gr.update(visible=True), do_reconstruction, reconstruction, wts, zs, attention_store, text_cross_attention_maps, do_inversion, show_share_button
     else: # if sega concepts were not added, performs regular ddpm sampling
       if do_reconstruction: # if ddpm sampling wasn't computed
           pure_ddpm_img, attention_store, text_cross_attention_maps = sample(zs, wts, attention_store=attention_store, text_cross_attention_maps=text_cross_attention_maps, prompt_tar=tar_prompt, skip=skip, cfg_scale_tar=tar_cfg_scale)
           reconstruction = pure_ddpm_img
           do_reconstruction = False
-          return pure_ddpm_img, gr.update(visible=False), do_reconstruction, reconstruction, wts, zs, attention_store, text_cross_attention_maps, do_inversion, show_share_button
-      return reconstruction, gr.update(visible=False), do_reconstruction, reconstruction, wts, zs, attention_store, text_cross_attention_maps, do_inversion, show_share_button
 def randomize_seed_fn(seed, is_random):

+import spaces
 import gradio as gr
 import torch
 import numpy as np
 blip_processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base",torch_dtype=torch.float16).to(device)
+## Helpers to bounce CUDA tensors across gr.State (multiprocessing pickle barrier on ZeroGPU)
+def _to_cpu(obj, _seen=None):
     # Recursively walk `obj` and return it with every CUDA torch.Tensor
     # replaced by a detached CPU copy.
     #
     # obj:   a tensor, a list/tuple/dict (nested to any depth), an object
     #        exposing `__dict__`, or any other value.
     # _seen: internal set of id()s of `__dict__` objects already visited;
     #        callers leave it as None and it is created on the first call.
     #
     # Returns: a new list/tuple/dict for containers, the same (mutated)
     # instance for `__dict__` objects, and the input itself otherwise.
     #
     # NOTE(review): tuple and dict subclasses (e.g. namedtuples) come back
     # as plain `tuple` / `dict`, because they are rebuilt with `tuple(...)`
     # and a dict comprehension.
     # NOTE(review): only `__dict__` objects are recorded in `_seen`; a list,
     # tuple or dict that contains itself is not guarded against.
+    if _seen is None:
+        _seen = set()
+    if isinstance(obj, torch.Tensor):
         # Tensors that are not on CUDA are returned as-is (not detached).
+        return obj.detach().cpu() if obj.is_cuda else obj
+    if isinstance(obj, list):
+        return [_to_cpu(x, _seen) for x in obj]
+    if isinstance(obj, tuple):
+        return tuple(_to_cpu(x, _seen) for x in obj)
+    if isinstance(obj, dict):
         # Keys are kept unchanged; only the values are converted.
+        return {k: _to_cpu(v, _seen) for k, v in obj.items()}
+    if hasattr(obj, "__dict__") and id(obj) not in _seen:
         # Record the object before descending so a reference cycle through
         # object attributes hits the final `return obj` instead of recursing.
+        _seen.add(id(obj))
         # list(...) snapshots the items so the setattr calls below do not run
         # against a live view of `__dict__`. Attributes are rewritten in place.
+        for k, v in list(obj.__dict__.items()):
+            setattr(obj, k, _to_cpu(v, _seen))
+        return obj
     # Everything else, including objects already visited, passes through
     # unchanged.
+    return obj
+def _to_cuda(obj, _seen=None):
     # Recursively walk `obj` and return it with every non-CUDA torch.Tensor
     # moved via `.to(device)`.
     #
     # obj:   a tensor, a list/tuple/dict (nested to any depth), an object
     #        exposing `__dict__`, or any other value.
     # _seen: internal set of id()s of `__dict__` objects already visited;
     #        callers leave it as None and it is created on the first call.
     #
     # Returns: a new list/tuple/dict for containers, the same (mutated)
     # instance for `__dict__` objects, and the input itself otherwise.
     #
     # NOTE(review): `device` is not defined in this function; it is assumed
     # to be a module-level name set elsewhere in the file - confirm.
     # NOTE(review): tuple and dict subclasses come back as plain `tuple` /
     # `dict`, and only `__dict__` objects are cycle-guarded through `_seen`.
+    if _seen is None:
+        _seen = set()
+    if isinstance(obj, torch.Tensor):
         # Tensors already on CUDA are returned as-is; unlike the CPU-bound
         # direction, no `.detach()` is applied here.
+        return obj.to(device) if not obj.is_cuda else obj
+    if isinstance(obj, list):
+        return [_to_cuda(x, _seen) for x in obj]
+    if isinstance(obj, tuple):
+        return tuple(_to_cuda(x, _seen) for x in obj)
+    if isinstance(obj, dict):
         # Keys are kept unchanged; only the values are converted.
+        return {k: _to_cuda(v, _seen) for k, v in obj.items()}
+    if hasattr(obj, "__dict__") and id(obj) not in _seen:
         # Record the object before descending so a reference cycle through
         # object attributes hits the final `return obj` instead of recursing.
+        _seen.add(id(obj))
         # list(...) snapshots the items so the setattr calls below do not run
         # against a live view of `__dict__`. Attributes are rewritten in place.
+        for k, v in list(obj.__dict__.items()):
+            setattr(obj, k, _to_cuda(v, _seen))
+        return obj
     # Everything else, including objects already visited, passes through
     # unchanged.
+    return obj
 ## IMAGE CAPTIONING ##
+@spaces.GPU
 def caption_image(input_image):
     inputs = blip_processor(images=input_image, return_tensors="pt").to(device, torch.float16)
     pixel_values = inputs.pixel_values
     return img.images[0], attention_store, text_cross_attention_maps
+@spaces.GPU
 def reconstruct(
     tar_prompt,
     image_caption,
     reconstruction,
     reconstruct_button,
 ):
+    wts = _to_cuda(wts)
+    zs = _to_cuda(zs)
+    attention_store = _to_cuda(attention_store)
+    text_cross_attention_maps = _to_cuda(text_cross_attention_maps)
     if reconstruct_button == "Hide Reconstruction":
         return (
             reconstruction,
 ## SEGA ##
+@spaces.GPU
 def edit(input_image,
             wts, zs, attention_store, text_cross_attention_maps,
             tar_prompt,
             neg_guidance_1, neg_guidance_2, neg_guidance_3,
             threshold_1, threshold_2, threshold_3,
             do_reconstruction,
+            reconstruction,
             # for inversion in case it needs to be re computed (and avoid delay):
             do_inversion,
+            seed,
             randomize_seed,
             src_prompt,
             src_cfg_scale,
             mask_type,
             progress=gr.Progress(track_tqdm=True)):
+    wts = _to_cuda(wts)
+    zs = _to_cuda(zs)
+    attention_store = _to_cuda(attention_store)
+    text_cross_attention_maps = _to_cuda(text_cross_attention_maps)
     show_share_button = gr.update(visible=True)
     if(mask_type == "No mask"):
         use_cross_attn_mask = False
                           # wts=wts.value,
                           zs=zs, attention_store=attention_store, text_cross_attention_maps=text_cross_attention_maps, **editing_args)
+      return sega_out.images[0], gr.update(visible=True), do_reconstruction, reconstruction, _to_cpu(wts), _to_cpu(zs), _to_cpu(attention_store), _to_cpu(text_cross_attention_maps), do_inversion, show_share_button
     else: # if sega concepts were not added, performs regular ddpm sampling
       if do_reconstruction: # if ddpm sampling wasn't computed
           pure_ddpm_img, attention_store, text_cross_attention_maps = sample(zs, wts, attention_store=attention_store, text_cross_attention_maps=text_cross_attention_maps, prompt_tar=tar_prompt, skip=skip, cfg_scale_tar=tar_cfg_scale)
           reconstruction = pure_ddpm_img
           do_reconstruction = False
+          return pure_ddpm_img, gr.update(visible=False), do_reconstruction, reconstruction, _to_cpu(wts), _to_cpu(zs), _to_cpu(attention_store), _to_cpu(text_cross_attention_maps), do_inversion, show_share_button
+      return reconstruction, gr.update(visible=False), do_reconstruction, reconstruction, _to_cpu(wts), _to_cpu(zs), _to_cpu(attention_store), _to_cpu(text_cross_attention_maps), do_inversion, show_share_button
 def randomize_seed_fn(seed, is_random):