Insert-Anything

Runtime error

App Files Files Community

isat committed on Aug 10, 2025

Commit

38264cb

verified ·

1 Parent(s): 7ea451b

Update app.py

Browse files

Files changed (1) hide show

app.py +207 -181

app.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import os
 import sys
 import cv2
@@ -6,48 +8,162 @@ import torch
 import gradio as gr
 from PIL import Image, ImageFilter, ImageDraw
-os.environ["HF_HUB_DISABLE_XET"] = "1"
-os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
-from huggingface_hub import snapshot_download
-from diffusers import FluxFillPipeline, FluxPriorReduxPipeline
-import math
-from utils.utils import get_bbox_from_mask, expand_bbox, pad_to_square, box2squre, crop_back, expand_image_mask
-import os,sys
-os.system("python -m pip install -e segment_anything")
-os.system("python -m pip install -e GroundingDINO")
 sys.path.append(os.path.join(os.getcwd(), "GroundingDINO"))
 sys.path.append(os.path.join(os.getcwd(), "segment_anything"))
-os.system("wget https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swinb_cogcoor.pth")
-os.system("wget https://huggingface.co/spaces/mrtlive/segment-anything-model/resolve/main/sam_vit_h_4b8939.pth")
-import torchvision
-from GroundingDINO.groundingdino.util.inference import load_model
-from segment_anything import build_sam, SamPredictor
-import spaces
-import GroundingDINO.groundingdino.datasets.transforms as T
-from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
 # GroundingDINO config and checkpoint
 GROUNDING_DINO_CONFIG_PATH = "./GroundingDINO_SwinB.cfg.py"
-GROUNDING_DINO_CHECKPOINT_PATH = "./groundingdino_swinb_cogcoor.pth"
 # Segment-Anything checkpoint
 SAM_ENCODER_VERSION = "vit_h"
-SAM_CHECKPOINT_PATH = "./sam_vit_h_4b8939.pth"
-# Building GroundingDINO inference model
-groundingdino_model  = load_model(model_config_path=GROUNDING_DINO_CONFIG_PATH, model_checkpoint_path=GROUNDING_DINO_CHECKPOINT_PATH, device="cuda")
-# Building SAM Model and SAM Predictor
 sam = build_sam(checkpoint=SAM_CHECKPOINT_PATH)
 sam.to(device="cuda")
 sam_predictor = SamPredictor(sam)
-def transform_image(image_pil):
     transform = T.Compose(
         [
             T.RandomResize([800], max_size=1333),
@@ -60,80 +176,54 @@ def transform_image(image_pil):
 def get_grounding_output(model, image, caption, box_threshold=0.25, text_threshold=0.25, with_logits=True):
-    caption = caption.lower()
-    caption = caption.strip()
     if not caption.endswith("."):
         caption = caption + "."
     with torch.no_grad():
         outputs = model(image[None], captions=[caption])
     logits = outputs["pred_logits"].cpu().sigmoid()[0]  # (nq, 256)
-    boxes = outputs["pred_boxes"].cpu()[0]  # (nq, 4)
-    logits.shape[0]
     # filter output
-    logits_filt = logits.clone()
-    boxes_filt = boxes.clone()
-    filt_mask = logits_filt.max(dim=1)[0] > box_threshold
-    logits_filt = logits_filt[filt_mask]  # num_filt, 256
-    boxes_filt = boxes_filt[filt_mask]  # num_filt, 4
-    logits_filt.shape[0]
     # get phrase
     tokenlizer = model.tokenizer
     tokenized = tokenlizer(caption)
-    # build pred
-    pred_phrases = []
-    scores = []
     for logit, box in zip(logits_filt, boxes_filt):
-        pred_phrase = get_phrases_from_posmap(
-            logit > text_threshold, tokenized, tokenlizer)
-        if with_logits:
-            pred_phrases.append(
-                pred_phrase + f"({str(logit.max().item())[:4]})")
-        else:
-            pred_phrases.append(pred_phrase)
         scores.append(logit.max().item())
     return boxes_filt, torch.Tensor(scores), pred_phrases
 def get_mask(image, label):
     global groundingdino_model, sam_predictor
     image_pil = image.convert("RGB")
     transformed_image = transform_image(image_pil)
     boxes_filt, scores, pred_phrases = get_grounding_output(
         groundingdino_model, transformed_image, label
     )
-    size = image_pil.size
-    # process boxes
-    H, W = size[1], size[0]
     for i in range(boxes_filt.size(0)):
         boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
         boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
         boxes_filt[i][2:] += boxes_filt[i][:2]
     boxes_filt = boxes_filt.cpu()
-    # nms
-    nms_idx = torchvision.ops.nms(
-        boxes_filt, scores, 0.8).numpy().tolist()
     boxes_filt = boxes_filt[nms_idx]
-    pred_phrases = [pred_phrases[idx] for idx in nms_idx]
-    image = np.array(image_pil)
-    sam_predictor.set_image(image)
     transformed_boxes = sam_predictor.transform.apply_boxes_torch(
-        boxes_filt, image.shape[:2]).to("cuda")
     masks, _, _ = sam_predictor.predict_torch(
         point_coords=None,
@@ -142,80 +232,34 @@ def get_mask(image, label):
         multimask_output=False,
     )
     result_mask = masks[0][0].cpu().numpy()
-    result_mask = Image.fromarray(result_mask)
-    return result_mask
 def create_highlighted_mask(image_np, mask_np, alpha=0.5, gray_value=128):
     if mask_np.max() <= 1.0:
         mask_np = (mask_np * 255).astype(np.uint8)
     mask_bool = mask_np > 128
     image_float = image_np.astype(np.float32)
-    # 灰色图层
     gray_overlay = np.full_like(image_float, gray_value, dtype=np.float32)
-    # 混合
     result = image_float.copy()
-    result[mask_bool] = (
-        (1 - alpha) * image_float[mask_bool] + alpha * gray_overlay[mask_bool]
-    )
     return result.astype(np.uint8)
-hf_token = os.getenv("HF_TOKEN")
-snapshot_download(repo_id="black-forest-labs/FLUX.1-Fill-dev", local_dir="./FLUX.1-Fill-dev", token=hf_token)
-snapshot_download(repo_id="black-forest-labs/FLUX.1-Redux-dev", local_dir="./FLUX.1-Redux-dev", token=hf_token)
-snapshot_download(repo_id="WensongSong/Insert-Anything", local_dir="./insertanything_model", token=hf_token)
-dtype = torch.bfloat16
-size = (768, 768)
-pipe = FluxFillPipeline.from_pretrained(
-    "./FLUX.1-Fill-dev",
-    torch_dtype=dtype
-).to("cuda")
-pipe.load_lora_weights(
-    "./insertanything_model/20250321_steps5000_pytorch_lora_weights.safetensors"
-)
-redux = FluxPriorReduxPipeline.from_pretrained("./FLUX.1-Redux-dev").to(dtype=dtype).to("cuda")
-###   example  #####
-ref_dir='./examples/ref_image'
-ref_mask_dir='./examples/ref_mask'
-image_dir='./examples/source_image'
-image_mask_dir='./examples/source_mask'
-ref_list=[os.path.join(ref_dir,file) for file in os.listdir(ref_dir) if '.jpg' in file or '.png' in file or '.jpeg' in file ]
-ref_list.sort()
-ref_mask_list=[os.path.join(ref_mask_dir,file) for file in os.listdir(ref_mask_dir) if '.jpg' in file or '.png' in file or '.jpeg' in file]
-ref_mask_list.sort()
-image_list=[os.path.join(image_dir,file) for file in os.listdir(image_dir) if '.jpg' in file or '.png' in file or '.jpeg' in file ]
-image_list.sort()
-image_mask_list=[os.path.join(image_mask_dir,file) for file in os.listdir(image_mask_dir) if '.jpg' in file or '.png' in file or '.jpeg' in file]
-image_mask_list.sort()
-###   example  #####
 @spaces.GPU
 def run_local(base_image, base_mask, reference_image, ref_mask, seed, base_mask_option, ref_mask_option, text_prompt):
     if base_mask_option == "Draw Mask":
         tar_image = base_image["background"]
         tar_mask = base_image["layers"][0]
@@ -250,42 +294,37 @@ def run_local(base_image, base_mask, reference_image, ref_mask, seed, base_mask_
     if tar_mask.sum() == 0:
         raise gr.Error('No mask for the background image.Please check mask button!')
     if ref_mask.sum() == 0:
         raise gr.Error('No mask for the reference image.Please check mask button!')
     ref_box_yyxx = get_bbox_from_mask(ref_mask)
-    ref_mask_3 = np.stack([ref_mask,ref_mask,ref_mask],-1)
-    masked_ref_image = ref_image * ref_mask_3 + np.ones_like(ref_image) * 255 * (1-ref_mask_3)
-    y1,y2,x1,x2 = ref_box_yyxx
-    masked_ref_image = masked_ref_image[y1:y2,x1:x2,:]
-    ref_mask = ref_mask[y1:y2,x1:x2]
     ratio = 1.3
     masked_ref_image, ref_mask = expand_image_mask(masked_ref_image, ref_mask, ratio=ratio)
-    masked_ref_image = pad_to_square(masked_ref_image, pad_value = 255, random = False)
     kernel = np.ones((7, 7), np.uint8)
     iterations = 2
     tar_mask = cv2.dilate(tar_mask, kernel, iterations=iterations)
-    # zome in
     tar_box_yyxx = get_bbox_from_mask(tar_mask)
     tar_box_yyxx = expand_bbox(tar_mask, tar_box_yyxx, ratio=1.2)
-    tar_box_yyxx_crop =  expand_bbox(tar_image, tar_box_yyxx, ratio=2)    #1.2 1.6
-    tar_box_yyxx_crop = box2squre(tar_image, tar_box_yyxx_crop) # crop box
-    y1,y2,x1,x2 = tar_box_yyxx_crop
     old_tar_image = tar_image.copy()
-    tar_image = tar_image[y1:y2,x1:x2,:]
-    tar_mask = tar_mask[y1:y2,x1:x2]
     H1, W1 = tar_image.shape[0], tar_image.shape[1]
-    # zome in
     tar_mask = pad_to_square(tar_mask, pad_value=0)
     tar_mask = cv2.resize(tar_mask, size)
@@ -293,19 +332,15 @@ def run_local(base_image, base_mask, reference_image, ref_mask, seed, base_mask_
     masked_ref_image = cv2.resize(masked_ref_image.astype(np.uint8), size).astype(np.uint8)
     pipe_prior_output = redux(Image.fromarray(masked_ref_image))
     tar_image = pad_to_square(tar_image, pad_value=255)
     H2, W2 = tar_image.shape[0], tar_image.shape[1]
     tar_image = cv2.resize(tar_image, size)
     diptych_ref_tar = np.concatenate([masked_ref_image, tar_image], axis=1)
-    tar_mask = np.stack([tar_mask,tar_mask,tar_mask],-1)
     mask_black = np.ones_like(tar_image) * 0
     mask_diptych = np.concatenate([mask_black, tar_mask], axis=1)
     show_diptych_ref_tar = create_highlighted_mask(diptych_ref_tar, mask_diptych)
     show_diptych_ref_tar = Image.fromarray(show_diptych_ref_tar)
@@ -313,8 +348,6 @@ def run_local(base_image, base_mask, reference_image, ref_mask, seed, base_mask_
     mask_diptych[mask_diptych == 1] = 255
     mask_diptych = Image.fromarray(mask_diptych)
     generator = torch.Generator("cuda").manual_seed(seed)
     edited_image = pipe(
         image=diptych_ref_tar,
@@ -323,27 +356,22 @@ def run_local(base_image, base_mask, reference_image, ref_mask, seed, base_mask_
         width=mask_diptych.size[0],
         max_sequence_length=512,
         generator=generator,
-        **pipe_prior_output,
     ).images[0]
     width, height = edited_image.size
     left = width // 2
-    right = width
-    top = 0
-    bottom = height
-    edited_image = edited_image.crop((left, top, right, bottom))
     edited_image = np.array(edited_image)
-    edited_image = crop_back(edited_image, old_tar_image, np.array([H1, W1, H2, W2]), np.array(tar_box_yyxx_crop))
     edited_image = Image.fromarray(edited_image)
     if ref_mask_option != "Label to Mask":
         return [show_diptych_ref_tar, edited_image]
     else:
-        return [return_ref_mask, show_diptych_ref_tar, edited_image]
 def update_ui(option):
     if option == "Draw Mask":
@@ -353,8 +381,6 @@ def update_ui(option):
 with gr.Blocks() as demo:
     gr.Markdown("# Insert-Anything")
     gr.Markdown("### Make sure to select the correct mask button!!")
     gr.Markdown("### Click the output image to toggle between Diptych and final results!!")
@@ -362,42 +388,42 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column(scale=1):
             with gr.Row():
-                base_image = gr.ImageEditor(label="Background Image", sources="upload", type="pil", brush=gr.Brush(colors=["#FFFFFF"],default_size = 30,color_mode = "fixed"),
-                                    layers = False,
-                                    interactive=True)
-                base_mask = gr.ImageEditor(label="Background Mask", sources="upload", type="pil", layers = False, brush=False, eraser=False)
             with gr.Row():
-                base_mask_option = gr.Radio(["Draw Mask", "Upload with Mask"], label="Background Mask Input Option", value="Upload with Mask")
             with gr.Row():
-                ref_image = gr.ImageEditor(label="Reference Image", sources="upload", type="pil", brush=gr.Brush(colors=["#FFFFFF"],default_size = 30,color_mode = "fixed"),
-                                    layers = False,
-                                    interactive=True)
-                ref_mask = gr.ImageEditor(label="Reference Mask", sources="upload", type="pil", layers = False, brush=False, eraser=False)
             with gr.Row():
-                ref_mask_option = gr.Radio(["Draw Mask", "Upload with Mask", "Label to Mask"], label="Reference Mask Input Option", value="Upload with Mask")
             with gr.Row():
-                text_prompt = gr.Textbox(label="Label", placeholder="Enter the category of the reference object, e.g., car, dress, toy, etc.")
         with gr.Column(scale=1):
             baseline_gallery = gr.Gallery(label='Output', show_label=True, elem_id="gallery", height=695, columns=1)
             with gr.Accordion("Advanced Option", open=True):
-                seed = gr.Slider(label="Seed", minimum=-1, maximum=999999999, step=1, value=666)
                 gr.Markdown("### Guidelines")
                 gr.Markdown(" Users can try using different seeds. For example, seeds like 42 and 123456 may produce different effects.")
                 gr.Markdown(" Draw Mask means manually drawing a mask on the original image.")
                 gr.Markdown(" Upload with Mask means uploading a mask file.")
                 gr.Markdown(" Label to Mask means simply inputting a label to automatically extract the mask and obtain the result.")
     run_local_button = gr.Button(value="Run")
-    # #### example #####
     num_examples = len(image_list)
     for i in range(num_examples):
         with gr.Row():
@@ -413,10 +439,10 @@ with gr.Blocks() as demo:
                 gr.Examples([ref_mask_list[i]], inputs=[ref_mask], examples_per_page=1, label="")
         if i < num_examples - 1:
             gr.HTML("<hr>")
-    # #### example #####
-    run_local_button.click(fn=run_local,
-                            inputs=[base_image, base_mask, ref_image, ref_mask, seed, base_mask_option, ref_mask_option, text_prompt],
-                            outputs=[baseline_gallery]
-                            )
-demo.launch()

+# app.py — storage-safe + HF Hub friendly
 import os
 import sys
 import cv2
 import gradio as gr
 from PIL import Image, ImageFilter, ImageDraw
# ---------- ENV & THREADS ----------
import warnings  # noqa: E402


def _resolve_thread_count(raw, default=2):
    """Parse *raw* as a positive integer thread count.

    Args:
        raw: Candidate value, typically a string read from the environment
            (``None`` when the variable is unset).
        default: Count returned when *raw* is missing, non-numeric, or < 1.

    Returns:
        The parsed count, or *default* when *raw* is unusable.
    """
    try:
        count = int(raw)
    except (TypeError, ValueError):
        return default
    return count if count >= 1 else default


# Map a Spaces variable (no underscores allowed) to the real OpenMP var.
# The value is validated first so that a typo in the Space settings can neither
# export a non-numeric OMP_NUM_THREADS nor silently skip the torch limits below.
omp_val = str(_resolve_thread_count(
    os.getenv("OMP-NUM-THREADS") or os.getenv("OMPNUMTHREADS")
))
os.environ["OMP_NUM_THREADS"] = omp_val
try:
    torch.set_num_threads(int(omp_val))
    torch.set_num_interop_threads(1)
except RuntimeError as exc:
    # Best effort: keep starting up if torch refuses to resize its thread pools,
    # but say so instead of hiding the failure.
    warnings.warn(f"Could not apply torch thread limits: {exc}")

# Send all caches to persistent storage.
# NOTE(review): these only affect libraries that read the environment after
# this point. If an import above (e.g. gradio) already pulled in
# huggingface_hub, its cache paths may be fixed by then -- confirm the ordering.
os.environ.setdefault("HF_HOME", "/data/.huggingface")
os.environ.setdefault("HF_HUB_CACHE", "/data/.huggingface/hub")
os.environ.setdefault("TRANSFORMERS_CACHE", "/data/.huggingface/transformers")
os.environ.setdefault("HF_DATASETS_CACHE", "/data/.huggingface/datasets")
# Disable Xet path, enable fast transfer
os.environ.setdefault("HF_HUB_DISABLE_XET", "1")
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
+# ---------- HUB IMPORTS ----------
+from huggingface_hub import snapshot_download, hf_hub_download  # noqa: E402
+from diffusers import FluxFillPipeline, FluxPriorReduxPipeline  # noqa: E402
+import math  # noqa: E402
+from utils.utils import (  # noqa: E402
+    get_bbox_from_mask, expand_bbox, pad_to_square, box2squre, crop_back, expand_image_mask
+)
# Optional editable installs ONLY if import fails (use requirements.txt ideally)
def _ensure_local_editable(pkg_name, rel_path):
    """Install *rel_path* in editable mode when *pkg_name* cannot be imported.

    This is a best-effort fallback: a failed install is reported on stderr and
    not raised, so the import that follows produces the real error.

    Args:
        pkg_name: Importable module name used to probe for the package.
        rel_path: Path handed to ``pip install -e``.

    Returns:
        None.
    """
    import importlib
    import subprocess

    # NOTE(review): a same-named source directory in the working directory may
    # import as a namespace package and make this probe succeed without an
    # install -- verify against the repository layout.
    try:
        importlib.import_module(pkg_name)
        return
    except ImportError:
        pass

    # sys.executable installs into the interpreter that is actually running;
    # the argument list avoids passing rel_path through a shell.
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", "-e", rel_path],
        check=False,
    )
    if result.returncode != 0:
        print(
            f"pip install -e {rel_path} failed with exit code {result.returncode}",
            file=sys.stderr,
        )
        return
    # Let the import system notice the freshly installed package.
    importlib.invalidate_caches()
+_ensure_local_editable("segment_anything", "segment_anything")
+_ensure_local_editable("GroundingDINO", "GroundingDINO")
 sys.path.append(os.path.join(os.getcwd(), "GroundingDINO"))
 sys.path.append(os.path.join(os.getcwd(), "segment_anything"))
+import torchvision  # noqa: E402
+from GroundingDINO.groundingdino.util.inference import load_model  # noqa: E402
+from segment_anything import build_sam, SamPredictor  # noqa: E402
+import spaces  # noqa: E402
+import GroundingDINO.groundingdino.datasets.transforms as T  # noqa: E402
+from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap  # noqa: E402
# ---------- PATHS ----------
# Models and checkpoints are kept in sub-directories of the persistent root.
PERSIST_ROOT = "/data"
MODELS_DIR, CKPT_DIR = (
    os.path.join(PERSIST_ROOT, sub) for sub in ("models", "checkpoints")
)
for _persistent_dir in (MODELS_DIR, CKPT_DIR):
    os.makedirs(_persistent_dir, exist_ok=True)

# Segment-Anything: encoder variant and checkpoint location
SAM_ENCODER_VERSION = "vit_h"
SAM_CHECKPOINT_PATH = os.path.join(CKPT_DIR, "sam_vit_h_4b8939.pth")

# GroundingDINO: config is read from the working directory, the checkpoint
# from the checkpoint directory
GROUNDING_DINO_CONFIG_PATH = "./GroundingDINO_SwinB.cfg.py"
GROUNDING_DINO_CHECKPOINT_PATH = os.path.join(CKPT_DIR, "groundingdino_swinb_cogcoor.pth")
# ---------- AUTH TOKEN ----------
hf_token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")


# ---------- DOWNLOAD CHECKPOINTS (single files) ----------
def _fetch_checkpoint(repo_id, filename, target_path, repo_type=None):
    """Download one file from the Hub to *target_path* unless it already exists.

    Args:
        repo_id: Hub repository id in ``namespace/name`` form.
        filename: File to fetch from that repository.
        target_path: Where later code expects the file to be.
        repo_type: Hub repository kind (e.g. ``"space"``); ``None`` keeps the
            library default.

    Returns:
        *target_path*.
    """
    if os.path.exists(target_path):
        return target_path
    downloaded = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        repo_type=repo_type,
        local_dir=CKPT_DIR,
        token=hf_token,
    )
    if downloaded != target_path:
        # Ensure the expected path exists for later code
        os.replace(downloaded, target_path)
    return target_path


# GroundingDINO ckpt (single file)
_fetch_checkpoint(
    "ShilongLiu/GroundingDINO",
    "groundingdino_swinb_cogcoor.pth",
    GROUNDING_DINO_CHECKPOINT_PATH,
)
# SAM ckpt (single file). The file is hosted in a Space, so the repo kind is
# passed as repo_type; "spaces/" is a URL prefix, not part of the repo id.
_fetch_checkpoint(
    "mrtlive/segment-anything-model",
    "sam_vit_h_4b8939.pth",
    SAM_CHECKPOINT_PATH,
    repo_type="space",
)
# ---------- DOWNLOAD MODELS (filtered snapshots into /data) ----------
FILL_DIR = os.path.join(MODELS_DIR, "FLUX.1-Fill-dev")
REDUX_DIR = os.path.join(MODELS_DIR, "FLUX.1-Redux-dev")
LORA_DIR = os.path.join(MODELS_DIR, "insertanything_model")
for path in (FILL_DIR, REDUX_DIR, LORA_DIR):
    os.makedirs(path, exist_ok=True)

# Only pull what we need (weights/configs). Keep symlinks to avoid copies.
_WEIGHT_AND_CONFIG_PATTERNS = ("*.safetensors", "*.json", "*.yaml", "*.txt")
_PIPELINE_PATTERNS = _WEIGHT_AND_CONFIG_PATTERNS + ("*.py", "*.model")
# (repo id, destination directory, file patterns to fetch)
_SNAPSHOT_SPECS = (
    ("black-forest-labs/FLUX.1-Fill-dev", FILL_DIR, _PIPELINE_PATTERNS),
    ("black-forest-labs/FLUX.1-Redux-dev", REDUX_DIR, _PIPELINE_PATTERNS),
    ("WensongSong/Insert-Anything", LORA_DIR, _WEIGHT_AND_CONFIG_PATTERNS),
)
for _repo_id, _target_dir, _patterns in _SNAPSHOT_SPECS:
    # A directory that already has entries is treated as downloaded.
    # NOTE(review): an interrupted download also leaves a non-empty directory,
    # which would be skipped on the next start -- confirm this is acceptable.
    if os.listdir(_target_dir):
        continue
    snapshot_download(
        repo_id=_repo_id,
        local_dir=_target_dir,
        local_dir_use_symlinks=True,
        allow_patterns=list(_patterns),
        token=hf_token,
    )
# ---------- BUILD MODELS ----------
# GroundingDINO, built from the config file and the checkpoint path on the GPU
groundingdino_model = load_model(
    device="cuda",
    model_checkpoint_path=GROUNDING_DINO_CHECKPOINT_PATH,
    model_config_path=GROUNDING_DINO_CONFIG_PATH,
)

# SAM moved to the GPU, then wrapped in its predictor
sam = build_sam(checkpoint=SAM_CHECKPOINT_PATH)
sam.to(device="cuda")
sam_predictor = SamPredictor(sam)

# Diffusers pipelines share one dtype; `size` is the resize target used by run_local
dtype = torch.bfloat16
size = (768, 768)

pipe = FluxFillPipeline.from_pretrained(FILL_DIR, torch_dtype=dtype)
pipe = pipe.to("cuda")
pipe.load_lora_weights(
    os.path.join(LORA_DIR, "20250321_steps5000_pytorch_lora_weights.safetensors")
)

redux = FluxPriorReduxPipeline.from_pretrained(REDUX_DIR)
redux = redux.to(dtype=dtype).to("cuda")
+# ---------- APP LOGIC ----------
+def transform_image(image_pil):
     transform = T.Compose(
         [
             T.RandomResize([800], max_size=1333),
 def get_grounding_output(model, image, caption, box_threshold=0.25, text_threshold=0.25, with_logits=True):
+    caption = caption.lower().strip()
     if not caption.endswith("."):
         caption = caption + "."
     with torch.no_grad():
         outputs = model(image[None], captions=[caption])
     logits = outputs["pred_logits"].cpu().sigmoid()[0]  # (nq, 256)
+    boxes = outputs["pred_boxes"].cpu()[0]              # (nq, 4)
     # filter output
+    filt_mask = logits.max(dim=1)[0] > box_threshold
+    logits_filt = logits[filt_mask]
+    boxes_filt = boxes[filt_mask]
     # get phrase
     tokenlizer = model.tokenizer
     tokenized = tokenlizer(caption)
+    pred_phrases, scores = [], []
     for logit, box in zip(logits_filt, boxes_filt):
+        pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
+        pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})" if with_logits else pred_phrase)
         scores.append(logit.max().item())
     return boxes_filt, torch.Tensor(scores), pred_phrases
 def get_mask(image, label):
     global groundingdino_model, sam_predictor
     image_pil = image.convert("RGB")
     transformed_image = transform_image(image_pil)
     boxes_filt, scores, pred_phrases = get_grounding_output(
         groundingdino_model, transformed_image, label
     )
+    W, H = image_pil.size
     for i in range(boxes_filt.size(0)):
         boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
         boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
         boxes_filt[i][2:] += boxes_filt[i][:2]
     boxes_filt = boxes_filt.cpu()
+    nms_idx = torchvision.ops.nms(boxes_filt, scores, 0.8).numpy().tolist()
     boxes_filt = boxes_filt[nms_idx]
+    image_np = np.array(image_pil)
+    sam_predictor.set_image(image_np)
     transformed_boxes = sam_predictor.transform.apply_boxes_torch(
+        boxes_filt, image_np.shape[:2]
+    ).to("cuda")
     masks, _, _ = sam_predictor.predict_torch(
         point_coords=None,
         multimask_output=False,
     )
     result_mask = masks[0][0].cpu().numpy()
+    return Image.fromarray(result_mask)
 def create_highlighted_mask(image_np, mask_np, alpha=0.5, gray_value=128):
     if mask_np.max() <= 1.0:
         mask_np = (mask_np * 255).astype(np.uint8)
     mask_bool = mask_np > 128
     image_float = image_np.astype(np.float32)
     gray_overlay = np.full_like(image_float, gray_value, dtype=np.float32)
     result = image_float.copy()
+    result[mask_bool] = (1 - alpha) * image_float[mask_bool] + alpha * gray_overlay[mask_bool]
     return result.astype(np.uint8)
# ---------- EXAMPLES ----------
_IMAGE_SUFFIXES = (".jpg", ".png", ".jpeg")


def _sorted_image_paths(folder):
    """Return the sorted paths of the .jpg/.png/.jpeg entries inside *folder*.

    The suffix test ignores case; paths are *folder* joined with the entry name.
    """
    matches = []
    for entry in os.listdir(folder):
        if entry.lower().endswith(_IMAGE_SUFFIXES):
            matches.append(os.path.join(folder, entry))
    matches.sort()
    return matches


ref_dir = './examples/ref_image'
ref_mask_dir = './examples/ref_mask'
image_dir = './examples/source_image'
image_mask_dir = './examples/source_mask'

ref_list = _sorted_image_paths(ref_dir)
ref_mask_list = _sorted_image_paths(ref_mask_dir)
image_list = _sorted_image_paths(image_dir)
image_mask_list = _sorted_image_paths(image_mask_dir)
 @spaces.GPU
 def run_local(base_image, base_mask, reference_image, ref_mask, seed, base_mask_option, ref_mask_option, text_prompt):
     if base_mask_option == "Draw Mask":
         tar_image = base_image["background"]
         tar_mask = base_image["layers"][0]
     if tar_mask.sum() == 0:
         raise gr.Error('No mask for the background image.Please check mask button!')
     if ref_mask.sum() == 0:
         raise gr.Error('No mask for the reference image.Please check mask button!')
     ref_box_yyxx = get_bbox_from_mask(ref_mask)
+    ref_mask_3 = np.stack([ref_mask, ref_mask, ref_mask], -1)
+    masked_ref_image = ref_image * ref_mask_3 + np.ones_like(ref_image) * 255 * (1 - ref_mask_3)
+    y1, y2, x1, x2 = ref_box_yyxx
+    masked_ref_image = masked_ref_image[y1:y2, x1:x2, :]
+    ref_mask = ref_mask[y1:y2, x1:x2]
     ratio = 1.3
     masked_ref_image, ref_mask = expand_image_mask(masked_ref_image, ref_mask, ratio=ratio)
+    masked_ref_image = pad_to_square(masked_ref_image, pad_value=255, random=False)
     kernel = np.ones((7, 7), np.uint8)
     iterations = 2
     tar_mask = cv2.dilate(tar_mask, kernel, iterations=iterations)
+    # zoom in
     tar_box_yyxx = get_bbox_from_mask(tar_mask)
     tar_box_yyxx = expand_bbox(tar_mask, tar_box_yyxx, ratio=1.2)
+    tar_box_yyxx_crop = expand_bbox(tar_image, tar_box_yyxx, ratio=2)
+    tar_box_yyxx_crop = box2squre(tar_image, tar_box_yyxx_crop)  # crop box
+    y1, y2, x1, x2 = tar_box_yyxx_crop
     old_tar_image = tar_image.copy()
+    tar_image = tar_image[y1:y2, x1:x2, :]
+    tar_mask = tar_mask[y1:y2, x1:x2]
     H1, W1 = tar_image.shape[0], tar_image.shape[1]
     tar_mask = pad_to_square(tar_mask, pad_value=0)
     tar_mask = cv2.resize(tar_mask, size)
     masked_ref_image = cv2.resize(masked_ref_image.astype(np.uint8), size).astype(np.uint8)
     pipe_prior_output = redux(Image.fromarray(masked_ref_image))
     tar_image = pad_to_square(tar_image, pad_value=255)
     H2, W2 = tar_image.shape[0], tar_image.shape[1]
     tar_image = cv2.resize(tar_image, size)
     diptych_ref_tar = np.concatenate([masked_ref_image, tar_image], axis=1)
+    tar_mask = np.stack([tar_mask, tar_mask, tar_mask], -1)
     mask_black = np.ones_like(tar_image) * 0
     mask_diptych = np.concatenate([mask_black, tar_mask], axis=1)
     show_diptych_ref_tar = create_highlighted_mask(diptych_ref_tar, mask_diptych)
     show_diptych_ref_tar = Image.fromarray(show_diptych_ref_tar)
     mask_diptych[mask_diptych == 1] = 255
     mask_diptych = Image.fromarray(mask_diptych)
     generator = torch.Generator("cuda").manual_seed(seed)
     edited_image = pipe(
         image=diptych_ref_tar,
         width=mask_diptych.size[0],
         max_sequence_length=512,
         generator=generator,
+        **pipe_prior_output,
     ).images[0]
     width, height = edited_image.size
     left = width // 2
+    edited_image = edited_image.crop((left, 0, width, height))
     edited_image = np.array(edited_image)
+    edited_image = crop_back(edited_image, old_tar_image, np.array([H1, W1, H2, W2]), np.array(tar_box_yyxx_crop))
     edited_image = Image.fromarray(edited_image)
     if ref_mask_option != "Label to Mask":
         return [show_diptych_ref_tar, edited_image]
     else:
+        return [return_ref_mask, show_diptych_ref_tar, edited_image]
 def update_ui(option):
     if option == "Draw Mask":
 with gr.Blocks() as demo:
     gr.Markdown("# Insert-Anything")
     gr.Markdown("### Make sure to select the correct mask button!!")
     gr.Markdown("### Click the output image to toggle between Diptych and final results!!")
     with gr.Row():
         with gr.Column(scale=1):
             with gr.Row():
+                base_image = gr.ImageEditor(label="Background Image", sources="upload", type="pil",
+                                            brush=gr.Brush(colors=["#FFFFFF"], default_size=30, color_mode="fixed"),
+                                            layers=False, interactive=True)
+                base_mask = gr.ImageEditor(label="Background Mask", sources="upload", type="pil",
+                                           layers=False, brush=False, eraser=False)
             with gr.Row():
+                base_mask_option = gr.Radio(["Draw Mask", "Upload with Mask"], label="Background Mask Input Option",
+                                            value="Upload with Mask")
             with gr.Row():
+                ref_image = gr.ImageEditor(label="Reference Image", sources="upload", type="pil",
+                                           brush=gr.Brush(colors=["#FFFFFF"], default_size=30, color_mode="fixed"),
+                                           layers=False, interactive=True)
+                ref_mask = gr.ImageEditor(label="Reference Mask", sources="upload", type="pil",
+                                          layers=False, brush=False, eraser=False)
             with gr.Row():
+                ref_mask_option = gr.Radio(["Draw Mask", "Upload with Mask", "Label to Mask"],
+                                           label="Reference Mask Input Option", value="Upload with Mask")
             with gr.Row():
+                text_prompt = gr.Textbox(label="Label",
+                                         placeholder="Enter the category of the reference object, e.g., car, dress, toy, etc.")
         with gr.Column(scale=1):
             baseline_gallery = gr.Gallery(label='Output', show_label=True, elem_id="gallery", height=695, columns=1)
             with gr.Accordion("Advanced Option", open=True):
+                seed = gr.Slider(label="Seed", minimum=-1, maximum=999_999_999, step=1, value=666)
                 gr.Markdown("### Guidelines")
                 gr.Markdown(" Users can try using different seeds. For example, seeds like 42 and 123456 may produce different effects.")
                 gr.Markdown(" Draw Mask means manually drawing a mask on the original image.")
                 gr.Markdown(" Upload with Mask means uploading a mask file.")
                 gr.Markdown(" Label to Mask means simply inputting a label to automatically extract the mask and obtain the result.")
     run_local_button = gr.Button(value="Run")
+    # examples
     num_examples = len(image_list)
     for i in range(num_examples):
         with gr.Row():
                 gr.Examples([ref_mask_list[i]], inputs=[ref_mask], examples_per_page=1, label="")
         if i < num_examples - 1:
             gr.HTML("<hr>")
+    run_local_button.click(
+        fn=run_local,
+        inputs=[base_image, base_mask, ref_image, ref_mask, seed, base_mask_option, ref_mask_option, text_prompt],
+        outputs=[baseline_gallery]
+    )
+demo.launch()