import numpy as np
import cv2
import clip
import torch
from PIL import Image
from torchvision import transforms
from torchvision.ops import box_convert
from transformers import pipeline
from groundingdino.datasets.transforms import Compose, RandomResize, ToTensor, Normalize
from groundingdino.util.inference import predict as dino_predict

from .models import get_sam_predictor, get_clip, get_device, get_groundingdino_model, get_dinov2_large

# Initialize an image captioning pipeline.
captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")

resize_transform = transforms.Compose([
    transforms.Resize(512, interpolation=Image.BICUBIC)
])


def get_dinov2_transform():
    """Return the DINOv2 preprocessing transform (resize, center crop, normalize)."""
    return transforms.Compose([
        transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])


def get_dino_boxes_from_prompt(image_np: np.ndarray, prompt: str,
                               box_threshold=0.3, text_threshold=0.25) -> np.ndarray:
    """Run Grounding DINO on an image with a text prompt and return the detected
    boxes as absolute xyxy pixel coordinates, shape (N, 4)."""
    pil = Image.fromarray(image_np)
    h, w = pil.height, pil.width

    # Standard Grounding DINO preprocessing.
    transform = Compose([
        RandomResize([800], max_size=1333),
        ToTensor(),
        Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    img_t, _ = transform(pil, None)   # tensor [C, H, W]
    img_t = img_t.to(get_device())    # move to GPU if available

    # Run Grounding DINO's predict API; it tokenizes the caption, runs the
    # forward pass, and post-processes the detections.
    boxes, scores, phrases = dino_predict(
        model=get_groundingdino_model(),
        image=img_t,
        caption=prompt,
        box_threshold=box_threshold,
        text_threshold=text_threshold,
        device=get_device()
    )
    if boxes.numel() == 0:
        return np.empty((0, 4), dtype=np.float32)  # no detections

    # Convert normalized cxcywh boxes (values in [0, 1]) to absolute xyxy pixels.
    boxes_abs = boxes * torch.tensor([w, h, w, h], device=boxes.device)
    xyxy = box_convert(boxes=boxes_abs, in_fmt="cxcywh", out_fmt="xyxy")
    return xyxy.cpu().numpy()  # shape (N, 4) in pixel coordinates


def get_sam_mask(image_np: np.ndarray, point_coords: np.ndarray,
                 point_labels: np.ndarray, sam_boxes: np.ndarray):
    """Run the SAM predictor with point and/or box prompts.

    Returns a tuple (mask_2d, bbox) where mask_2d is a boolean HxW mask and
    bbox is a dict with keys "x", "y", "width", "height".
    """
    pred = get_sam_predictor()
    pred.set_image(image_np)
    masks, scores, _ = pred.predict(
        point_coords=point_coords,
        point_labels=point_labels,
        box=sam_boxes,
        multimask_output=False
    )
    idx = int(np.argmax(scores))
    selected_mask = masks[idx]  # boolean mask

    # Derive a bounding box from the mask.
    if selected_mask is None or selected_mask.size == 0:
        bbox = {"x": 0, "y": 0, "width": 0, "height": 0}
        mask_2d = selected_mask
    else:
        # Remove singleton dimensions if present.
        mask_2d = np.squeeze(selected_mask)
        if mask_2d.ndim != 2:
            raise ValueError(f"Expected 2D mask, got shape {mask_2d.shape}")
        ys, xs = np.where(mask_2d)
        if len(xs) == 0 or len(ys) == 0:
            # Fallback in case the mask is empty.
            bbox = {"x": 0, "y": 0, "width": 0, "height": 0}
        else:
            x_min, x_max = int(xs.min()), int(xs.max())
            y_min, y_max = int(ys.min()), int(ys.max())
            bbox = {
                "x": x_min,
                "y": y_min,
                "width": x_max - x_min,
                "height": y_max - y_min
            }
    return mask_2d, bbox


def embed_image_dino_large(image_np: np.ndarray) -> np.ndarray:
    """
    Embed an image using the DINOv2 large model.

    Args:
        image_np: numpy array representing the image.

    Returns:
        np.ndarray: L2-normalized 1024-dimensional embedding.
    """
    model = get_dinov2_large()
    transform = get_dinov2_transform()
    device = get_device()

    # Convert numpy array to PIL image and apply preprocessing.
    pil = Image.fromarray(image_np)
    inp = transform(pil).unsqueeze(0).to(device)

    # Get embeddings.
    with torch.no_grad():
        feats = model(inp).cpu().numpy()[0]

    # Normalize the features.
    norm = np.linalg.norm(feats)
    return feats / norm if norm > 0 else feats


def embed_image_clip(image_np: np.ndarray) -> np.ndarray:
    """Embed an image with CLIP and return an L2-normalized feature vector."""
    model, preprocess = get_clip()
    device = get_device()
    pil = Image.fromarray(image_np)
    inp = preprocess(pil).unsqueeze(0).to(device)
    with torch.no_grad():
        feats = model.encode_image(inp).cpu().numpy()[0]
    norm = np.linalg.norm(feats)
    return feats / norm if norm > 0 else feats


def embed_text(text: str) -> np.ndarray:
    """Embed a text string with the CLIP text encoder."""
    model, _ = get_clip()
    device = get_device()
    tokens = clip.tokenize([text]).to(device)
    with torch.no_grad():
        text_features = model.encode_text(tokens)
    return text_features[0].cpu().numpy()


def generate_description_vllm(pil_image):
    """
    Generate a default caption for the image using the captioning model.
    """
    # The captioning pipeline expects a PIL image, so convert numpy input first.
    if isinstance(pil_image, np.ndarray):
        pil_image = Image.fromarray(pil_image)
    output = captioner(pil_image)
    return output[0]['generated_text']


def expand_coords_shape(point_coords, point_labels, box_count: int):
    """
    Expands point coordinates and labels to match the number of bounding boxes.

    Parameters:
        point_coords: Array of shape (P, 2) representing point coordinates.
        point_labels: Array of shape (P,) representing point labels.
        box_count: Number of bounding boxes to tile the coordinates and labels for.

    Returns:
        A tuple (expanded_point_coords, expanded_point_labels) where:
        - expanded_point_coords is of shape (B, P, 2).
        - expanded_point_labels is of shape (B, P).
    """
    # Add a batch dimension.
    point_coords = point_coords[None, ...]  # (1, P, 2)
    point_labels = point_labels[None, ...]  # (1, P)

    # Tile to match the number of bounding boxes.
    point_coords = np.tile(point_coords, (box_count, 1, 1))  # (B, P, 2)
    point_labels = np.tile(point_labels, (box_count, 1))     # (B, P)
    return point_coords, point_labels
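

# Example usage: a minimal sketch of how these helpers compose into a
# prompt -> boxes -> mask -> embedding pipeline. The image path "example.jpg"
# and the prompt "a dog" are placeholders, and this assumes the `.models`
# helpers can load their weights in the current environment; adapt as needed.
if __name__ == "__main__":
    image = np.array(Image.open("example.jpg").convert("RGB"))

    # Detect regions matching a text prompt with Grounding DINO.
    boxes = get_dino_boxes_from_prompt(image, "a dog", box_threshold=0.3)

    if len(boxes) > 0:
        # Segment the first detection with SAM, prompting with the box centre.
        cx = (boxes[0][0] + boxes[0][2]) / 2
        cy = (boxes[0][1] + boxes[0][3]) / 2
        point_coords = np.array([[cx, cy]])
        point_labels = np.array([1])
        mask, bbox = get_sam_mask(image, point_coords, point_labels, boxes[0])

        # Crop the detected region and embed it for similarity search.
        x, y, w, h = bbox["x"], bbox["y"], bbox["width"], bbox["height"]
        if w > 0 and h > 0:
            crop = image[y:y + h, x:x + w]
            dino_vec = embed_image_dino_large(crop)
            clip_vec = embed_image_clip(crop)
            print(bbox, dino_vec.shape, clip_vec.shape)

    # Caption the whole image and embed the caption text.
    caption = generate_description_vllm(image)
    text_vec = embed_text(caption)
    print(caption, text_vec.shape)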