Spaces:

jixin0101
/

ObjectClear

Running on Zero

App Files Community

srkanth

by srikanthsri - opened Aug 19, 2025

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+38

-56

Files changed (4) hide show

app.py +1 -1
model.py +1 -0
pipeline_objectclear.py +35 -54
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import gradio as gr
-import spaces
 import os
 from PIL import Image
 import torch
@@ -11,6 +10,7 @@ import argparse
 import numpy as np
 import torchvision.transforms.functional as TF
 from scipy.ndimage import convolve, zoom
 from utils import resize_by_short_side
 from tools.interact_tools import SamControler

 import gradio as gr
 import os
 from PIL import Image
 import torch
 import numpy as np
 import torchvision.transforms.functional as TF
 from scipy.ndimage import convolve, zoom
+import spaces
 from utils import resize_by_short_side
 from tools.interact_tools import SamControler

model.py CHANGED Viewed

@@ -3,6 +3,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 import torchvision.transforms as T
 from transformers.models.clip.modeling_clip import (
     CLIPPreTrainedModel,
     CLIPModel,
 )

 import torch.nn.functional as F
 import torchvision.transforms as T
 from transformers.models.clip.modeling_clip import (
+    CLIPTextTransformer,
     CLIPPreTrainedModel,
     CLIPModel,
 )

pipeline_objectclear.py CHANGED Viewed

@@ -14,7 +14,6 @@
 import inspect
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-import os
 import numpy as np
 import PIL.Image
@@ -335,7 +334,6 @@ def retrieve_timesteps(
 class ObjectClearPipelineOutput(StableDiffusionXLPipelineOutput):
     attns: Optional[List[PIL.Image.Image]] = None
 class ObjectClearPipeline(
     DiffusionPipeline,
     StableDiffusionMixin,
@@ -430,7 +428,7 @@ class ObjectClearPipeline(
         requires_aesthetics_score: bool = False,
         force_zeros_for_empty_prompt: bool = True,
         add_watermarker: Optional[bool] = None,
-        apply_attention_guided_fusion: bool = True,
     ):
         super().__init__()
@@ -465,7 +463,9 @@ class ObjectClearPipeline(
         if self.config.apply_attention_guided_fusion:
             self.cross_attention_scores = {}
-            self.original_state = None
     @classmethod
@@ -486,17 +486,14 @@ class ObjectClearPipeline(
         )
         postfuse_module = PostfuseModule(embed_dim=2048, embed_dim_img=768)
-        sub_folder = "postfuse_module"
         filename = "model.safetensors"
-        if pretrained_model_name_or_path == "jixin0101/ObjectClear":
-            safetensor_path = hf_hub_download(
-                repo_id="jixin0101/ObjectClear",
-                filename=filename,
-                subfolder="postfuse_module",
-                cache_dir=cache_dir
-            )
-        else:
-            safetensor_path = os.path.join(pretrained_model_name_or_path, sub_folder, filename)
         state_dict_postfuse = load_file(safetensor_path)
         postfuse_module.load_state_dict(state_dict_postfuse)
@@ -540,7 +537,7 @@ class ObjectClearPipeline(
             return image_embeds, uncond_image_embeds
-    def unet_store_cross_attention_scores(self, unet, attention_scores, applicable_layers=None):
         from diffusers.models.attention_processor import (
             Attention,
             AttnProcessor,
@@ -548,25 +545,34 @@ class ObjectClearPipeline(
         )
         import types
-        TARGET_LAYER = "down_blocks.1.attentions.0.transformer_blocks.0.attn2"
-        original_state = {}
         def make_new_get_attention_scores_fn(name):
             def new_get_attention_scores(module, query, key, attention_mask=None):
                 attention_probs = module.old_get_attention_scores(
                     query, key, attention_mask
                 )
-                if name == TARGET_LAYER:
-                    attention_scores[name] = attention_probs
                 return attention_probs
             return new_get_attention_scores
         for name, module in unet.named_modules():
-            if isinstance(module, Attention) and name == TARGET_LAYER and "attn2" in name:
-                original_state[name] = {
-                    "processor": module.processor,
-                    "get_attention_scores": module.get_attention_scores
-                }
                 if isinstance(module.processor, AttnProcessor2_0):
                     module.set_processor(AttnProcessor())
                 module.old_get_attention_scores = module.get_attention_scores
@@ -575,19 +581,6 @@ class ObjectClearPipeline(
                 )
                 module.get_attention_scores = module.new_get_attention_scores
-        return unet, original_state
-    def unet_restore_attention_processor(self, unet, original_state):
-        from diffusers.models.attention_processor import Attention
-        for name, module in unet.named_modules():
-            if isinstance(module, Attention) and "attn2" in name and name in original_state:
-                module.get_attention_scores = original_state[name]["get_attention_scores"]
-                module.set_processor(original_state[name]["processor"])
-                if hasattr(module, "old_get_attention_scores"):
-                    delattr(module, "old_get_attention_scores")
-                if hasattr(module, "new_get_attention_scores"):
-                    delattr(module, "new_get_attention_scores")
         return unet
     def resize_attn_map_divide2(self, attn_map, mask, fuse_index):
@@ -1433,7 +1426,7 @@ class ObjectClearPipeline(
                 on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before
                 resizing to the original image size for inpainting. This is useful when the masked area is small while
                 the image is large and contain information irrelevant for inpainting, such as background.
-            strength (`float`, *optional*, defaults to 0.9999):
                 Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be
                 between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the
                 `strength`. The number of denoising steps depends on the amount of noise initially added. When
@@ -1878,12 +1871,6 @@ class ObjectClearPipeline(
             for i, t in enumerate(timesteps):
                 if self.interrupt:
                     continue
-                # Inject cross-attention storage logic at the last timestep
-                if i == len(timesteps) - 1 and self.config.apply_attention_guided_fusion:
-                    self.unet, self.original_state = self.unet_store_cross_attention_scores(
-                        self.unet,
-                        self.cross_attention_scores
-                    )
                 # expand the latents if we are doing classifier free guidance
                 latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
@@ -1937,8 +1924,8 @@ class ObjectClearPipeline(
                         )
                         latents = (1 - init_mask) * init_latents_proper + init_mask * latents
-                    if i == len(timesteps) - 1 and self.config.apply_attention_guided_fusion:
                         attn_key, attn_map = next(iter(self.cross_attention_scores.items()))
                         attn_map = self.resize_attn_map_divide2(attn_map, mask, fuse_index)
                         init_latents_proper = image_latents
@@ -1947,13 +1934,7 @@ class ObjectClearPipeline(
                         else:
                             init_mask = attn_map
                         attn_map = init_mask
-                        self.unet = self.unet_restore_attention_processor(
-                            self.unet,
-                            self.original_state
-                        )
-                        self.clear_cross_attention_scores(self.cross_attention_scores)
                 if num_channels_unet == 4:
                     init_latents_proper = image_latents
@@ -2076,4 +2057,4 @@ class ObjectClearPipeline(
         else:
             if not return_dict:
                 return (image,)
-            return ObjectClearPipelineOutput(images=image)

 import inspect
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 import numpy as np
 import PIL.Image
 class ObjectClearPipelineOutput(StableDiffusionXLPipelineOutput):
     attns: Optional[List[PIL.Image.Image]] = None
 class ObjectClearPipeline(
     DiffusionPipeline,
     StableDiffusionMixin,
         requires_aesthetics_score: bool = False,
         force_zeros_for_empty_prompt: bool = True,
         add_watermarker: Optional[bool] = None,
+        apply_attention_guided_fusion: bool = False,
     ):
         super().__init__()
         if self.config.apply_attention_guided_fusion:
             self.cross_attention_scores = {}
+            self.unet = self.unet_store_cross_attention_scores(
+                self.unet, self.cross_attention_scores
+            )
     @classmethod
         )
         postfuse_module = PostfuseModule(embed_dim=2048, embed_dim_img=768)
         filename = "model.safetensors"
+        safetensor_path = hf_hub_download(
+            repo_id="jixin0101/ObjectClear",
+            filename=filename,
+            subfolder="postfuse_module",
+            cache_dir=cache_dir
+        )
         state_dict_postfuse = load_file(safetensor_path)
         postfuse_module.load_state_dict(state_dict_postfuse)
             return image_embeds, uncond_image_embeds
+    def unet_store_cross_attention_scores(self, unet, attention_scores):
         from diffusers.models.attention_processor import (
             Attention,
             AttnProcessor,
         )
         import types
+        UNET_LAYER_NAMES = [
+            "down_blocks.0",
+            "down_blocks.1",
+            "down_blocks.2",
+            "mid_block",
+            "up_blocks.1",
+            "up_blocks.2",
+            "up_blocks.3",
+        ]
+        start_layer = 0
+        end_layer = 2
+        applicable_layers = UNET_LAYER_NAMES[start_layer:end_layer]
         def make_new_get_attention_scores_fn(name):
             def new_get_attention_scores(module, query, key, attention_mask=None):
                 attention_probs = module.old_get_attention_scores(
                     query, key, attention_mask
                 )
+                attention_scores[name] = attention_probs
                 return attention_probs
             return new_get_attention_scores
         for name, module in unet.named_modules():
+            if isinstance(module, Attention) and "attn2" in name:
+                if not any(layer in name for layer in applicable_layers):
+                    continue
                 if isinstance(module.processor, AttnProcessor2_0):
                     module.set_processor(AttnProcessor())
                 module.old_get_attention_scores = module.get_attention_scores
                 )
                 module.get_attention_scores = module.new_get_attention_scores
         return unet
     def resize_attn_map_divide2(self, attn_map, mask, fuse_index):
                 on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before
                 resizing to the original image size for inpainting. This is useful when the masked area is small while
                 the image is large and contain information irrelevant for inpainting, such as background.
+            strength (`float`, *optional*, defaults to 1.0):
                 Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be
                 between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the
                 `strength`. The number of denoising steps depends on the amount of noise initially added. When
             for i, t in enumerate(timesteps):
                 if self.interrupt:
                     continue
                 # expand the latents if we are doing classifier free guidance
                 latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
                         )
                         latents = (1 - init_mask) * init_latents_proper + init_mask * latents
+                    if i == len(timesteps) - 1:
                         attn_key, attn_map = next(iter(self.cross_attention_scores.items()))
                         attn_map = self.resize_attn_map_divide2(attn_map, mask, fuse_index)
                         init_latents_proper = image_latents
                         else:
                             init_mask = attn_map
                         attn_map = init_mask
+                    self.clear_cross_attention_scores(self.cross_attention_scores)
                 if num_channels_unet == 4:
                     init_latents_proper = image_latents
         else:
             if not return_dict:
                 return (image,)
+            return ObjectClearPipelineOutput(images=image)

requirements.txt CHANGED Viewed

@@ -1,5 +1,5 @@
 accelerate
-torch==2.8.0
 torchvision
 numpy==1.26.4
 opencv-python

 accelerate
+torch==2.2.0
 torchvision
 numpy==1.26.4
 opencv-python