import numpy as np
import cv2
from PIL import Image
import clip
import torch
from .models import get_sam_predictor, get_clip, get_device, get_groundingdino_model, get_dinov2_large
from torchvision.ops import box_convert
from groundingdino.datasets.transforms import Compose, RandomResize, ToTensor, Normalize
from groundingdino.util.inference import predict as dino_predict
from torchvision import transforms
from transformers import pipeline
# Initialize an image captioning pipeline.
captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")

resize_transform = transforms.Compose([
    transforms.Resize(512, interpolation=transforms.InterpolationMode.BICUBIC)
])

def get_dinov2_transform():
    """Get the standard DINOv2 preprocessing transform (resize, center-crop, normalize)."""
    return transforms.Compose([
        transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

def get_dino_boxes_from_prompt(image_np: np.ndarray, prompt: str, box_threshold=0.3, text_threshold=0.25) -> np.ndarray:
    """Run GroundingDINO on an image and return detected boxes as absolute xyxy pixel coordinates."""
    pil = Image.fromarray(image_np)
    h, w = pil.height, pil.width
    transform = Compose([
        RandomResize([800], max_size=1333),
        ToTensor(),
        Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    img_t, _ = transform(pil, None)   # tensor of shape [C, H, W]
    img_t = img_t.to(get_device())    # move to GPU if available
    # Run GroundingDINO's predict API; it tokenizes the caption, runs the
    # forward pass, and post-processes the outputs.
    boxes, scores, phrases = dino_predict(
        model=get_groundingdino_model(),
        image=img_t,
        caption=prompt,
        box_threshold=box_threshold,
        text_threshold=text_threshold,
        device=get_device()
    )
    if boxes.numel() == 0:
        # No detections: return an empty (0, 4) array rather than the input
        # image, so callers always receive box-shaped output.
        return np.empty((0, 4), dtype=np.float32)
    # Convert normalized cxcywh boxes (shape [N, 4], values in [0, 1])
    # to absolute xyxy pixel coordinates.
    boxes_abs = boxes * torch.tensor([w, h, w, h], device=boxes.device)
    xyxy = box_convert(boxes=boxes_abs, in_fmt="cxcywh", out_fmt="xyxy")
    return xyxy.cpu().numpy()  # shape [N, 4] in pixel coordinates
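
# Illustrative usage (a sketch; "dog.jpg" and the threshold value are
# hypothetical):
#
#   image_np = np.array(Image.open("dog.jpg").convert("RGB"))
#   boxes = get_dino_boxes_from_prompt(image_np, "a dog", box_threshold=0.35)
#   # `boxes` is an (N, 4) array of absolute xyxy pixel coordinates,
#   # suitable to pass to get_sam_mask below as `sam_boxes`.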

def get_sam_mask(image_np: np.ndarray, point_coords: np.ndarray, point_labels: np.ndarray, sam_boxes: np.ndarray) -> tuple:
    """Run the SAM predictor and return the best mask plus its bounding box."""
    pred = get_sam_predictor()
    pred.set_image(image_np)
    masks, scores, _ = pred.predict(
        point_coords=point_coords,
        point_labels=point_labels,
        multimask_output=False,
        box=sam_boxes
    )
    idx = int(np.argmax(scores))
    selected_mask = masks[idx]  # boolean mask
    # Derive a bounding box from the mask.
    if selected_mask is None or selected_mask.size == 0:
        bbox = {"x": 0, "y": 0, "width": 0, "height": 0}
        mask_2d = selected_mask
    else:
        # Remove singleton dimensions if present.
        mask_2d = np.squeeze(selected_mask)
        if mask_2d.ndim != 2:
            raise ValueError(f"Expected 2D mask, got shape {mask_2d.shape}")
        ys, xs = np.where(mask_2d)
        if len(xs) == 0 or len(ys) == 0:
            # Fallback in case the mask is empty.
            bbox = {"x": 0, "y": 0, "width": 0, "height": 0}
        else:
            x_min, x_max = int(xs.min()), int(xs.max())
            y_min, y_max = int(ys.min()), int(ys.max())
            bbox = {
                "x": x_min,
                "y": y_min,
                "width": x_max - x_min,
                "height": y_max - y_min
            }
    return mask_2d, bbox
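
# Illustrative usage chaining GroundingDINO into SAM (a sketch; assumes at
# least one box was detected, and the point/label values are hypothetical):
#
#   boxes = get_dino_boxes_from_prompt(image_np, "a cat")
#   point_coords = np.array([[320, 240]])   # (P, 2) pixel coordinates
#   point_labels = np.array([1])            # 1 = foreground point
#   mask, bbox = get_sam_mask(image_np, point_coords, point_labels, boxes[0])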

def embed_image_dino_large(image_np: np.ndarray) -> np.ndarray:
    """
    Embed an image using the DINOv2 large model.

    Args:
        image_np: numpy array representing the image.

    Returns:
        np.ndarray: L2-normalized 1024-dimensional embedding.
    """
    model = get_dinov2_large()
    transform = get_dinov2_transform()
    device = get_device()
    # Convert the numpy array to a PIL image and apply preprocessing.
    pil = Image.fromarray(image_np)
    inp = transform(pil).unsqueeze(0).to(device)
    # Extract features without tracking gradients.
    with torch.no_grad():
        feats = model(inp).cpu().numpy()[0]
    # L2-normalize the features.
    norm = np.linalg.norm(feats)
    return feats / norm if norm > 0 else feats
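
# Because the embedding is L2-normalized, cosine similarity between two
# images reduces to a dot product (a sketch; image_a/image_b are hypothetical
# numpy images):
#
#   sim = float(np.dot(embed_image_dino_large(image_a),
#                      embed_image_dino_large(image_b)))  # in [-1, 1]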

def embed_image_clip(image_np: np.ndarray) -> np.ndarray:
    """Embed an image with CLIP and return the L2-normalized feature vector."""
    model, preprocess = get_clip()
    device = get_device()
    pil = Image.fromarray(image_np)
    inp = preprocess(pil).unsqueeze(0).to(device)
    with torch.no_grad():
        feats = model.encode_image(inp).cpu().numpy()[0]
    norm = np.linalg.norm(feats)
    return feats / norm if norm > 0 else feats

def embed_text(text: str) -> np.ndarray:
    """Embed a text string with CLIP and return the L2-normalized feature vector."""
    model, _ = get_clip()
    device = get_device()
    tokens = clip.tokenize([text]).to(device)
    with torch.no_grad():
        text_features = model.encode_text(tokens)
    feats = text_features[0].cpu().numpy()
    # Normalize so text and image embeddings are directly comparable via
    # dot product, matching embed_image_clip above.
    norm = np.linalg.norm(feats)
    return feats / norm if norm > 0 else feats
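
# Illustrative text-to-image scoring (a sketch): both vectors are
# L2-normalized above, so the dot product is cosine similarity.
#
#   img_vec = embed_image_clip(image_np)
#   txt_vec = embed_text("a photo of a dog")
#   score = float(np.dot(img_vec, txt_vec))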

def generate_description_vllm(pil_image: Image.Image) -> str:
    """Generate a default caption for the image using the captioning pipeline."""
    output = captioner(pil_image)
    return output[0]['generated_text']
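
# Illustrative usage (a sketch; "photo.jpg" is a hypothetical file):
#
#   caption = generate_description_vllm(Image.open("photo.jpg"))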

def expand_coords_shape(point_coords, point_labels, box_count: int):
    """
    Expand point coordinates and labels to match the number of bounding boxes.

    Parameters:
        point_coords: Array of shape (P, 2) representing point coordinates.
        point_labels: Array of shape (P,) representing point labels.
        box_count: Number of bounding boxes to tile the coordinates and labels for.

    Returns:
        A tuple (expanded_point_coords, expanded_point_labels) where:
        - expanded_point_coords has shape (B, P, 2).
        - expanded_point_labels has shape (B, P).
    """
    # Add a batch dimension.
    point_coords = point_coords[None, ...]  # (1, P, 2)
    point_labels = point_labels[None, ...]  # (1, P)
    # Tile to match the number of bounding boxes.
    point_coords = np.tile(point_coords, (box_count, 1, 1))  # (B, P, 2)
    point_labels = np.tile(point_labels, (box_count, 1))     # (B, P)
    return point_coords, point_labels
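
# Illustrative shape check (a sketch): tiling two prompt points across
# three boxes.
#
#   pts = np.array([[10, 20], [30, 40]])  # (P=2, 2)
#   lbl = np.array([1, 0])                # (P=2,)
#   c, l = expand_coords_shape(pts, lbl, box_count=3)
#   # c.shape == (3, 2, 2); l.shape == (3, 2)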