import os
import json
from typing import Any
import numpy as np
import random
import torch
import torchvision
import torchvision.transforms as transforms
from PIL import Image
import cv2
import matplotlib.pyplot as plt
from cog import BasePredictor, Input, Path, BaseModel

from subprocess import call

# Install the vendored GroundingDINO and segment_anything packages before importing them.
HOME = os.getcwd()
os.chdir("GroundingDINO")
call("pip install -q .", shell=True)
os.chdir(HOME)
os.chdir("segment_anything")
call("pip install -q .", shell=True)
os.chdir(HOME)

# Grounding DINO
import GroundingDINO.groundingdino.datasets.transforms as T
from GroundingDINO.groundingdino.models import build_model
from GroundingDINO.groundingdino.util.slconfig import SLConfig
from GroundingDINO.groundingdino.util.utils import (
    clean_state_dict,
    get_phrases_from_posmap,
)

# Segment Anything (and its HQ variant)
from segment_anything import build_sam, build_sam_hq, SamPredictor

# Recognize Anything Model (RAM) image tagger
from ram.models import ram


class ModelOutput(BaseModel):
    """Prediction outputs: RAM tags, annotated box image, mask image, and a JSON label file."""

    tags: str
    rounding_box_img: Path
    masked_img: Path
    json_data: Any


class Predictor(BasePredictor):
    def setup(self):
        """Load the model into memory to make running multiple predictions efficient"""
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        normalize = transforms.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
        )
        self.image_size = 384
        self.transform = transforms.Compose(
            [
                transforms.Resize((self.image_size, self.image_size)),
                transforms.ToTensor(),
                normalize,
            ]
        )

        # RAM image-tagging model
        self.ram_model = ram(
            pretrained="pretrained/ram_swin_large_14m.pth",
            image_size=self.image_size,
            vit="swin_l",
        )
        self.ram_model.eval()
        self.ram_model = self.ram_model.to(self.device)

        # GroundingDINO open-vocabulary detector
        self.model = load_model(
            "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
            "pretrained/groundingdino_swint_ogc.pth",
            device=self.device,
        )

        # SAM and HQ-SAM predictors
        self.sam = SamPredictor(
            build_sam(checkpoint="pretrained/sam_vit_h_4b8939.pth").to(self.device)
        )
        self.sam_hq = SamPredictor(
            build_sam_hq(checkpoint="pretrained/sam_hq_vit_h.pth").to(self.device)
        )

    def predict(
        self,
        input_image: Path = Input(description="Input image"),
        use_sam_hq: bool = Input(
            description="Use sam_hq instead of SAM for prediction", default=False
        ),
    ) -> ModelOutput:
        """Run a single prediction on the model"""

        # detection and NMS thresholds
        box_threshold = 0.25
        text_threshold = 0.2
        iou_threshold = 0.5

        image_pil, image = load_image(str(input_image))

        # RAM tagging: resize, normalize, and generate the tag string
        raw_image = image_pil.resize((self.image_size, self.image_size))
        raw_image = self.transform(raw_image).unsqueeze(0).to(self.device)

        with torch.no_grad():
            tags, tags_chinese = self.ram_model.generate_tag(raw_image)

        # RAM separates tags with " | "; GroundingDINO expects a comma-separated caption
        tags = tags[0].replace(" |", ",")

        # ground the tags as bounding boxes
        boxes_filt, scores, pred_phrases = get_grounding_output(
            self.model, image, tags, box_threshold, text_threshold, device=self.device
        )

        predictor = self.sam_hq if use_sam_hq else self.sam

        image = cv2.imread(str(input_image))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        predictor.set_image(image)

        # convert boxes from normalized (cx, cy, w, h) to absolute (x0, y0, x1, y1)
        size = image_pil.size
        H, W = size[1], size[0]
        for i in range(boxes_filt.size(0)):
            boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
            boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
            boxes_filt[i][2:] += boxes_filt[i][:2]

        boxes_filt = boxes_filt.cpu()

        # use NMS to remove overlapping boxes
        print(f"Before NMS: {boxes_filt.shape[0]} boxes")
        nms_idx = (
            torchvision.ops.nms(boxes_filt, scores, iou_threshold).numpy().tolist()
        )
        boxes_filt = boxes_filt[nms_idx]
        pred_phrases = [pred_phrases[idx] for idx in nms_idx]
        print(f"After NMS: {boxes_filt.shape[0]} boxes")

        # run SAM on the detected boxes
        transformed_boxes = predictor.transform.apply_boxes_torch(
            boxes_filt, image.shape[:2]
        ).to(self.device)

        masks, _, _ = predictor.predict_torch(
            point_coords=None,
            point_labels=None,
            boxes=transformed_boxes,
            multimask_output=False,
        )

        # draw the box/mask overlay on the input image
        plt.figure(figsize=(10, 10))
        plt.imshow(image)
        for mask in masks:
            show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
        for box, label in zip(boxes_filt, pred_phrases):
            show_box(box.numpy(), plt.gca(), label)

        rounding_box_path = "/tmp/automatic_label_output.png"
        plt.axis("off")
        plt.savefig(
            Path(rounding_box_path), bbox_inches="tight", dpi=300, pad_inches=0.0
        )
        plt.close()

        # render an index-mask image (0 = background, i = i-th detection)
        value = 0
        mask_img = torch.zeros(masks.shape[-2:])
        for idx, mask in enumerate(masks):
            mask_img[mask.cpu().numpy()[0] == True] = value + idx + 1
        plt.figure(figsize=(10, 10))
        plt.imshow(mask_img.numpy())
        plt.axis("off")
        masks_path = "/tmp/mask.png"
        plt.savefig(masks_path, bbox_inches="tight", dpi=300, pad_inches=0.0)
        plt.close()

        # per-mask metadata: label, confidence, and box in absolute xyxy pixels
        json_data = {
            "tags": tags,
            "mask": [{"value": value, "label": "background"}],
        }
        for label, box in zip(pred_phrases, boxes_filt):
            value += 1
            name, logit = label.split("(")
            logit = logit[:-1]  # strip the trailing ")"
            json_data["mask"].append(
                {
                    "value": value,
                    "label": name,
                    "logit": float(logit),
                    "box": box.numpy().tolist(),
                }
            )

        json_path = "/tmp/label.json"
        with open(json_path, "w") as f:
            json.dump(json_data, f)

        return ModelOutput(
            tags=tags,
            masked_img=Path(masks_path),
            rounding_box_img=Path(rounding_box_path),
            json_data=Path(json_path),
        )


def get_grounding_output(
    model, image, caption, box_threshold, text_threshold, device="cpu"
):
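    """Run GroundingDINO with `caption` (here, the RAM tag string) as the text prompt.

    Returns filtered boxes in normalized (cx, cy, w, h) format, a tensor of
    per-box confidence scores, and phrase labels of the form "name(score)".
    """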
    caption = caption.lower()
    caption = caption.strip()
    if not caption.endswith("."):
        caption = caption + "."
    model = model.to(device)
    image = image.to(device)
    with torch.no_grad():
        outputs = model(image[None], captions=[caption])
    logits = outputs["pred_logits"].cpu().sigmoid()[0]  # (num_queries, 256)
    boxes = outputs["pred_boxes"].cpu()[0]  # (num_queries, 4)

    # keep boxes whose best token logit exceeds box_threshold
    logits_filt = logits.clone()
    boxes_filt = boxes.clone()
    filt_mask = logits_filt.max(dim=1)[0] > box_threshold
    logits_filt = logits_filt[filt_mask]
    boxes_filt = boxes_filt[filt_mask]

    # map token-level logits back to phrases from the caption
    tokenizer = model.tokenizer
    tokenized = tokenizer(caption)

    pred_phrases = []
    scores = []
    for logit, box in zip(logits_filt, boxes_filt):
        pred_phrase = get_phrases_from_posmap(
            logit > text_threshold, tokenized, tokenizer
        )
        pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
        scores.append(logit.max().item())

    return boxes_filt, torch.Tensor(scores), pred_phrases


def load_image(image_path):
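    """Load an image and return (original PIL image, normalized tensor for GroundingDINO)."""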
    image_pil = Image.open(image_path).convert("RGB")

    transform = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    image, _ = transform(image_pil, None)
    return image_pil, image


def load_model(model_config_path, model_checkpoint_path, device):
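    """Build a GroundingDINO model from a config file and load its checkpoint."""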
    args = SLConfig.fromfile(model_config_path)
    args.device = device
    model = build_model(args)
    checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
    load_res = model.load_state_dict(
        clean_state_dict(checkpoint["model"]), strict=False
    )
    print(load_res)
    _ = model.eval()
    return model


def show_mask(mask, ax, random_color=False):
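    """Overlay a single binary mask on a matplotlib axis as a translucent color."""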
    if random_color:
        color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
    else:
        color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
    h, w = mask.shape[-2:]
    mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
    ax.imshow(mask_image)


def show_box(box, ax, label):
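    """Draw an (x0, y0, x1, y1) box and its label on a matplotlib axis."""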
    x0, y0 = box[0], box[1]
    w, h = box[2] - box[0], box[3] - box[1]
    ax.add_patch(
        plt.Rectangle((x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=1.5)
    )
    ax.text(x0, y0, label)