Files changed (1) hide show
  1. app.py +185 -315
app.py CHANGED
@@ -1,337 +1,207 @@
1
- import os, sys
2
- import random
3
- import warnings
4
-
5
- os.system("python -m pip install -e segment_anything")
6
- os.system("python -m pip install -e GroundingDINO")
7
- os.system("pip install --upgrade diffusers[torch]")
8
- os.system("pip install opencv-python pycocotools matplotlib onnxruntime onnx ipykernel")
9
- os.system("wget https://github.com/IDEA-Research/Grounded-Segment-Anything/raw/main/assets/demo1.jpg")
10
- os.system("wget https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swint_ogc.pth")
11
- os.system("wget https://huggingface.co/spaces/mrtlive/segment-anything-model/resolve/main/sam_vit_h_4b8939.pth")
12
- sys.path.append(os.path.join(os.getcwd(), "GroundingDINO"))
13
- sys.path.append(os.path.join(os.getcwd(), "segment_anything"))
14
- warnings.filterwarnings("ignore")
15
-
16
  import gradio as gr
17
- import argparse
18
-
19
  import numpy as np
20
- import torch
21
- import torchvision
22
- from PIL import Image, ImageDraw, ImageFont
23
 
24
- # Grounding DINO
25
- import GroundingDINO.groundingdino.datasets.transforms as T
26
- from GroundingDINO.groundingdino.models import build_model
27
- from GroundingDINO.groundingdino.util.slconfig import SLConfig
28
- from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
 
29
 
30
- # segment anything
31
- from segment_anything import build_sam, SamPredictor
32
- import numpy as np
33
 
34
- # diffusers
35
- import torch
36
- from diffusers import StableDiffusionInpaintPipeline
37
 
38
- # BLIP
39
- from transformers import BlipProcessor, BlipForConditionalGeneration
 
40
 
 
 
41
 
42
- def generate_caption(processor, blip_model, raw_image):
43
- # unconditional image captioning
44
- inputs = processor(raw_image, return_tensors="pt").to(
45
- "cuda", torch.float16)
46
- out = blip_model.generate(**inputs)
47
- caption = processor.decode(out[0], skip_special_tokens=True)
48
- return caption
49
 
 
50
 
51
- def transform_image(image_pil):
52
 
53
- transform = T.Compose(
54
- [
55
- T.RandomResize([800], max_size=1333),
56
- T.ToTensor(),
57
- T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
58
- ]
59
- )
60
- image, _ = transform(image_pil, None) # 3, h, w
61
- return image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
 
63
 
64
- def load_model(model_config_path, model_checkpoint_path, device):
65
- args = SLConfig.fromfile(model_config_path)
66
- args.device = device
67
- model = build_model(args)
68
- checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
69
- load_res = model.load_state_dict(
70
- clean_state_dict(checkpoint["model"]), strict=False)
71
- print(load_res)
72
- _ = model.eval()
73
- return model
74
 
 
 
 
 
 
 
75
 
76
- def get_grounding_output(model, image, caption, box_threshold, text_threshold, with_logits=True):
77
- caption = caption.lower()
78
- caption = caption.strip()
79
- if not caption.endswith("."):
80
- caption = caption + "."
 
 
 
81
 
82
  with torch.no_grad():
83
- outputs = model(image[None], captions=[caption])
84
- logits = outputs["pred_logits"].cpu().sigmoid()[0] # (nq, 256)
85
- boxes = outputs["pred_boxes"].cpu()[0] # (nq, 4)
86
- logits.shape[0]
87
-
88
- # filter output
89
- logits_filt = logits.clone()
90
- boxes_filt = boxes.clone()
91
- filt_mask = logits_filt.max(dim=1)[0] > box_threshold
92
- logits_filt = logits_filt[filt_mask] # num_filt, 256
93
- boxes_filt = boxes_filt[filt_mask] # num_filt, 4
94
- logits_filt.shape[0]
95
-
96
- # get phrase
97
- tokenlizer = model.tokenizer
98
- tokenized = tokenlizer(caption)
99
- # build pred
100
- pred_phrases = []
101
- scores = []
102
- for logit, box in zip(logits_filt, boxes_filt):
103
- pred_phrase = get_phrases_from_posmap(
104
- logit > text_threshold, tokenized, tokenlizer)
105
- if with_logits:
106
- pred_phrases.append(
107
- pred_phrase + f"({str(logit.max().item())[:4]})")
108
- else:
109
- pred_phrases.append(pred_phrase)
110
- scores.append(logit.max().item())
111
-
112
- return boxes_filt, torch.Tensor(scores), pred_phrases
113
-
114
-
115
- def draw_mask(mask, draw, random_color=False):
116
- if random_color:
117
- color = (random.randint(0, 255), random.randint(
118
- 0, 255), random.randint(0, 255), 153)
119
- else:
120
- color = (30, 144, 255, 153)
121
-
122
- nonzero_coords = np.transpose(np.nonzero(mask))
123
-
124
- for coord in nonzero_coords:
125
- draw.point(coord[::-1], fill=color)
126
-
127
-
128
- def draw_box(box, draw, label):
129
- # random color
130
- color = tuple(np.random.randint(0, 255, size=3).tolist())
131
-
132
- draw.rectangle(((box[0], box[1]), (box[2], box[3])),
133
- outline=color, width=2)
134
-
135
- if label:
136
- font = ImageFont.load_default()
137
- if hasattr(font, "getbbox"):
138
- bbox = draw.textbbox((box[0], box[1]), str(label), font)
139
- else:
140
- w, h = draw.textsize(str(label), font)
141
- bbox = (box[0], box[1], w + box[0], box[1] + h)
142
- draw.rectangle(bbox, fill=color)
143
- draw.text((box[0], box[1]), str(label), fill="white")
144
-
145
- draw.text((box[0], box[1]), label)
146
-
147
-
148
- config_file = 'GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py'
149
- ckpt_repo_id = "ShilongLiu/GroundingDINO"
150
- ckpt_filenmae = "groundingdino_swint_ogc.pth"
151
- sam_checkpoint = 'sam_vit_h_4b8939.pth'
152
- output_dir = "outputs"
153
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
154
-
155
-
156
- blip_processor = None
157
- blip_model = None
158
- groundingdino_model = None
159
- sam_predictor = None
160
- inpaint_pipeline = None
161
-
162
-
163
- def run_grounded_sam(input_image, text_prompt, task_type, inpaint_prompt, box_threshold, text_threshold, iou_threshold, inpaint_mode):
164
-
165
- global blip_processor, blip_model, groundingdino_model, sam_predictor, inpaint_pipeline
166
-
167
- # make dir
168
- os.makedirs(output_dir, exist_ok=True)
169
- # load image
170
- image_pil = input_image.convert("RGB")
171
- transformed_image = transform_image(image_pil)
172
-
173
- if groundingdino_model is None:
174
- groundingdino_model = load_model(
175
- config_file, ckpt_filenmae, device=device)
176
-
177
- if task_type == 'automatic':
178
- # generate caption and tags
179
- # use Tag2Text can generate better captions
180
- # https://huggingface.co/spaces/xinyu1205/Tag2Text
181
- # but there are some bugs...
182
- blip_processor = blip_processor or BlipProcessor.from_pretrained(
183
- "Salesforce/blip-image-captioning-large")
184
- blip_model = blip_model or BlipForConditionalGeneration.from_pretrained(
185
- "Salesforce/blip-image-captioning-large", torch_dtype=torch.float16).to("cuda")
186
- text_prompt = generate_caption(blip_processor, blip_model, image_pil)
187
- print(f"Caption: {text_prompt}")
188
-
189
- # run grounding dino model
190
- boxes_filt, scores, pred_phrases = get_grounding_output(
191
- groundingdino_model, transformed_image, text_prompt, box_threshold, text_threshold
192
  )
193
 
194
- size = image_pil.size
195
-
196
- # process boxes
197
- H, W = size[1], size[0]
198
- for i in range(boxes_filt.size(0)):
199
- boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
200
- boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
201
- boxes_filt[i][2:] += boxes_filt[i][:2]
202
-
203
- boxes_filt = boxes_filt.cpu()
204
-
205
- # nms
206
- print(f"Before NMS: {boxes_filt.shape[0]} boxes")
207
- nms_idx = torchvision.ops.nms(
208
- boxes_filt, scores, iou_threshold).numpy().tolist()
209
- boxes_filt = boxes_filt[nms_idx]
210
- pred_phrases = [pred_phrases[idx] for idx in nms_idx]
211
- print(f"After NMS: {boxes_filt.shape[0]} boxes")
212
-
213
- if task_type == 'seg' or task_type == 'inpainting' or task_type == 'automatic':
214
- if sam_predictor is None:
215
- # initialize SAM
216
- assert sam_checkpoint, 'sam_checkpoint is not found!'
217
- sam = build_sam(checkpoint=sam_checkpoint)
218
- sam.to(device=device)
219
- sam_predictor = SamPredictor(sam)
220
-
221
- image = np.array(image_pil)
222
- sam_predictor.set_image(image)
223
-
224
- if task_type == 'automatic':
225
- # use NMS to handle overlapped boxes
226
- print(f"Revise caption with number: {text_prompt}")
227
-
228
- transformed_boxes = sam_predictor.transform.apply_boxes_torch(
229
- boxes_filt, image.shape[:2]).to(device)
230
-
231
- masks, _, _ = sam_predictor.predict_torch(
232
- point_coords=None,
233
- point_labels=None,
234
- boxes=transformed_boxes,
235
- multimask_output=False,
236
- )
237
-
238
- # masks: [1, 1, 512, 512]
239
-
240
- if task_type == 'det':
241
- image_draw = ImageDraw.Draw(image_pil)
242
- for box, label in zip(boxes_filt, pred_phrases):
243
- draw_box(box, image_draw, label)
244
-
245
- return [image_pil]
246
- elif task_type == 'seg' or task_type == 'automatic':
247
-
248
- mask_image = Image.new('RGBA', size, color=(0, 0, 0, 0))
249
-
250
- mask_draw = ImageDraw.Draw(mask_image)
251
- for mask in masks:
252
- draw_mask(mask[0].cpu().numpy(), mask_draw, random_color=True)
253
-
254
- image_draw = ImageDraw.Draw(image_pil)
255
-
256
- for box, label in zip(boxes_filt, pred_phrases):
257
- draw_box(box, image_draw, label)
258
-
259
- if task_type == 'automatic':
260
- image_draw.text((10, 10), text_prompt, fill='black')
261
-
262
- image_pil = image_pil.convert('RGBA')
263
- image_pil.alpha_composite(mask_image)
264
- return [image_pil, mask_image]
265
- elif task_type == 'inpainting':
266
- assert inpaint_prompt, 'inpaint_prompt is not found!'
267
- # inpainting pipeline
268
- if inpaint_mode == 'merge':
269
- masks = torch.sum(masks, dim=0).unsqueeze(0)
270
- masks = torch.where(masks > 0, True, False)
271
- # simply choose the first mask, which will be refine in the future release
272
- mask = masks[0][0].cpu().numpy()
273
- mask_pil = Image.fromarray(mask)
274
-
275
- if inpaint_pipeline is None:
276
- inpaint_pipeline = StableDiffusionInpaintPipeline.from_pretrained(
277
- "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
278
- )
279
- inpaint_pipeline = inpaint_pipeline.to("cuda")
280
-
281
- image = inpaint_pipeline(prompt=inpaint_prompt, image=image_pil.resize(
282
- (512, 512)), mask_image=mask_pil.resize((512, 512))).images[0]
283
- image = image.resize(size)
284
-
285
- return [image, mask_pil]
286
- else:
287
- print("task_type:{} error!".format(task_type))
288
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
 
290
  if __name__ == "__main__":
291
- parser = argparse.ArgumentParser("Grounded SAM demo", add_help=True)
292
- parser.add_argument("--debug", action="store_true",
293
- help="using debug mode")
294
- parser.add_argument("--share", action="store_true", help="share the app")
295
- parser.add_argument('--no-gradio-queue', action="store_true",
296
- help='path to the SAM checkpoint')
297
- args = parser.parse_args()
298
-
299
- print(args)
300
-
301
- block = gr.Blocks()
302
- if not args.no_gradio_queue:
303
- block = block.queue()
304
-
305
-
306
- with block:
307
- with gr.Row():
308
- with gr.Column():
309
- input_image = gr.Image(
310
- source='upload', type="pil", value="demo1.jpg")
311
- task_type = gr.Dropdown(
312
- ["det", "seg", "inpainting", "automatic"], value="seg", label="task_type")
313
- text_prompt = gr.Textbox(label="Text Prompt", placeholder="bear . beach .")
314
- inpaint_prompt = gr.Textbox(label="Inpaint Prompt", placeholder="A dinosaur, detailed, 4K.")
315
- run_button = gr.Button(label="Run")
316
- with gr.Accordion("Advanced options", open=False):
317
- box_threshold = gr.Slider(
318
- label="Box Threshold", minimum=0.0, maximum=1.0, value=0.3, step=0.001
319
- )
320
- text_threshold = gr.Slider(
321
- label="Text Threshold", minimum=0.0, maximum=1.0, value=0.25, step=0.001
322
- )
323
- iou_threshold = gr.Slider(
324
- label="IOU Threshold", minimum=0.0, maximum=1.0, value=0.8, step=0.001
325
- )
326
- inpaint_mode = gr.Dropdown(
327
- ["merge", "first"], value="merge", label="inpaint_mode")
328
-
329
- with gr.Column():
330
- gallery = gr.Gallery(
331
- label="Generated images", show_label=False, elem_id="gallery"
332
- ).style(preview=True, grid=2, object_fit="scale-down")
333
-
334
- run_button.click(fn=run_grounded_sam, inputs=[
335
- input_image, text_prompt, task_type, inpaint_prompt, box_threshold, text_threshold, iou_threshold, inpaint_mode], outputs=gallery)
336
-
337
- block.launch(debug=args.debug, share=args.share, show_error=True)
 
1
import os
import torch

import gradio as gr
import numpy as np
from PIL import Image

from transformers import (
    AutoProcessor,
    AutoModelForZeroShotObjectDetection,
    BlipProcessor,
    BlipForConditionalGeneration,
)

from segment_anything import sam_model_registry, SamPredictor

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# --------------------------------------------------
# MODELS
# --------------------------------------------------

DINO_MODEL = "IDEA-Research/grounding-dino-base"
BLIP_MODEL = "Salesforce/blip-image-captioning-base"

SAM_TYPE = "vit_b"
SAM_CHECKPOINT = "sam_vit_b.pth"
SAM_URL = "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth"

# Minimum detection confidence kept by detect().
BOX_THRESHOLD = 0.3

# --------------------------------------------------
# DOWNLOAD SAM
# --------------------------------------------------

if not os.path.exists(SAM_CHECKPOINT):
    import urllib.request

    print("Downloading SAM checkpoint...")
    try:
        urllib.request.urlretrieve(SAM_URL, SAM_CHECKPOINT)
    except BaseException:
        # A failed or interrupted download leaves a truncated file behind;
        # the existence check above would then skip re-downloading and SAM
        # would fail to load on every subsequent start. Remove the partial
        # file before propagating the error.
        if os.path.exists(SAM_CHECKPOINT):
            os.remove(SAM_CHECKPOINT)
        raise

# --------------------------------------------------
# LOAD MODELS (once, at import time)
# --------------------------------------------------

print("Loading GroundingDINO...")
processor = AutoProcessor.from_pretrained(DINO_MODEL)
dino = AutoModelForZeroShotObjectDetection.from_pretrained(DINO_MODEL).to(DEVICE)

print("Loading SAM...")
sam = sam_model_registry[SAM_TYPE](checkpoint=SAM_CHECKPOINT)
sam.to(device=DEVICE)
predictor = SamPredictor(sam)

print("Loading BLIP...")
blip_processor = BlipProcessor.from_pretrained(BLIP_MODEL)
blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_MODEL).to(DEVICE)
 
64
def generate_caption(image):
    """Return an unconditional BLIP caption for *image*.

    The image is encoded with the module-level ``blip_processor`` and the
    caption is generated by the module-level ``blip_model``.
    """
    encoded = blip_processor(image, return_tensors="pt").to(DEVICE)

    # No gradients needed for inference-only generation.
    with torch.no_grad():
        token_ids = blip_model.generate(**encoded)

    return blip_processor.decode(token_ids[0], skip_special_tokens=True)
74
 
75
+
76
+ # --------------------------------------------------
77
+ # DETECT OBJECTS
78
+ # --------------------------------------------------
79
+
80
def detect(image, prompt):
    """Run zero-shot object detection with GroundingDINO.

    Args:
        image: PIL image.
        prompt: free-text query. GroundingDINO expects lower-cased phrases
            terminated by a period (e.g. ``"a cat. a dog."``), so the prompt
            is normalized here — important for "automatic" mode, which feeds
            a raw BLIP caption in.

    Returns:
        Tensor of boxes (x1, y1, x2, y2) in pixel coordinates whose score
        exceeds ``BOX_THRESHOLD``.
    """
    # Normalize the query to the format GroundingDINO was trained on.
    text = prompt.lower().strip()
    if not text.endswith("."):
        text += "."

    inputs = processor(images=image, text=text, return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        outputs = dino(**inputs)

    # input_ids are required so the processor can map predictions back to
    # the query tokens; image.size is (W, H), target_sizes wants (H, W).
    results = processor.post_process_grounded_object_detection(
        outputs,
        input_ids=inputs.input_ids,
        target_sizes=[image.size[::-1]],
    )[0]

    boxes = results["boxes"]
    scores = results["scores"]

    # Keep only confident detections.
    keep = scores > BOX_THRESHOLD

    return boxes[keep]
98
+
99
+
100
+ # --------------------------------------------------
101
+ # DRAW BOXES
102
+ # --------------------------------------------------
103
+
104
def draw_boxes(image, boxes):
    """Draw a 3-pixel red rectangle for each box onto a copy of *image*.

    Args:
        image: PIL image.
        boxes: iterable of (x1, y1, x2, y2) tensors in pixel coordinates.

    Returns:
        A new PIL image with the rectangles drawn; the input is not mutated.
    """
    canvas = np.array(image).copy()
    height, width = canvas.shape[:2]
    red = [255, 0, 0]

    for box in boxes:
        x1, y1, x2, y2 = box.cpu().numpy().astype(int)

        # Detector boxes can fall slightly outside the image; a negative
        # coordinate used as a slice bound would wrap to the opposite edge,
        # so clip everything into the valid range first.
        x1, x2 = np.clip([x1, x2], 0, width - 1)
        y1, y2 = np.clip([y1, y2], 0, height - 1)

        canvas[y1:y1 + 3, x1:x2] = red  # top edge
        canvas[y2:y2 + 3, x1:x2] = red  # bottom edge
        canvas[y1:y2, x1:x1 + 3] = red  # left edge
        canvas[y1:y2, x2:x2 + 3] = red  # right edge

    return Image.fromarray(canvas)
119
+
120
+
121
+ # --------------------------------------------------
122
+ # SEGMENT
123
+ # --------------------------------------------------
124
+
125
def segment(image, prompt):
    """Detect objects matching *prompt* and overlay their SAM masks in green.

    Returns the RGB-converted input unchanged when nothing is detected;
    otherwise returns a new PIL image with each mask blended 50/50 with
    solid green.
    """
    rgb = image.convert("RGB")
    pixels = np.array(rgb)

    detected = detect(rgb, prompt)
    if len(detected) == 0:
        # No boxes above threshold: nothing to segment.
        return rgb

    predictor.set_image(pixels)

    # SAM wants boxes on the model device, transformed to its input frame.
    sam_boxes = predictor.transform.apply_boxes_torch(
        detected.to(DEVICE), pixels.shape[:2]
    )

    masks, _, _ = predictor.predict_torch(
        point_coords=None,
        point_labels=None,
        boxes=sam_boxes,
        multimask_output=False,
    )

    overlay = pixels.copy()
    green = np.array([0, 255, 0])

    for mask in masks:
        region = mask[0].cpu().numpy() > 0
        # 50/50 blend of the original pixels with solid green.
        overlay[region] = (overlay[region] * 0.5 + green * 0.5).astype(np.uint8)

    return Image.fromarray(overlay)
161
+
162
+
163
+ # --------------------------------------------------
164
+ # PIPELINE
165
+ # --------------------------------------------------
166
+
167
def run_pipeline(image, prompt, mode):
    """Dispatch a Gradio request to the selected task.

    Modes:
        "seg"       -- GroundingDINO boxes refined into SAM mask overlays.
        "det"       -- GroundingDINO boxes drawn on the image.
        "automatic" -- BLIP caption used as the detection prompt.

    Raises:
        ValueError: if *mode* is not one of the supported tasks. (Previously
        an unknown mode fell through and implicitly returned None, which
        Gradio renders as an opaque failure.)
    """
    if mode == "seg":
        return segment(image, prompt)

    if mode == "det":
        boxes = detect(image, prompt)
        return draw_boxes(image, boxes)

    if mode == "automatic":
        # Let BLIP describe the image, then ground that description.
        caption = generate_caption(image)
        print("BLIP caption:", caption)
        return segment(image, caption)

    raise ValueError(f"Unknown mode: {mode!r}")
185
+
186
+
187
+ # --------------------------------------------------
188
+ # UI
189
+ # --------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
# Build the Gradio interface: image + prompt + mode in, one image out.
_image_input = gr.Image(type="pil")
_prompt_input = gr.Textbox(label="Prompt", value="person")
_mode_input = gr.Dropdown(["seg", "det", "automatic"], value="seg", label="Mode")

demo = gr.Interface(
    fn=run_pipeline,
    inputs=[_image_input, _prompt_input, _mode_input],
    outputs=gr.Image(),
    title="GroundingDINO + SAM + BLIP (CPU version)",
)

if __name__ == "__main__":
    demo.launch()