import numpy as np
import cv2
from PIL import Image
import clip
import torch
from .models import get_sam_predictor, get_clip, get_device, get_groundingdino_model, get_dinov2_large
from torchvision.ops import box_convert
from groundingdino.datasets.transforms import Compose, RandomResize, ToTensor, Normalize
from groundingdino.util.inference import predict as dino_predict
from torchvision import transforms
from transformers import pipeline


# Image captioning pipeline (used by generate_description_vllm below).
captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")

# Resize transform (shorter side to 512 px, bicubic), kept at module level for reuse.
resize_transform = transforms.Compose([
    transforms.Resize(512, interpolation=transforms.InterpolationMode.BICUBIC)
])


def get_dinov2_transform():
    """Get DINOv2 preprocessing transform"""
    transform = transforms.Compose([
        transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    return transform

def get_dino_boxes_from_prompt(image_np: np.ndarray, prompt: str, box_threshold=0.3, text_threshold=0.25) -> np.ndarray:
    """Run GroundingDINO on an RGB image and return the boxes matching the text prompt
    as an [N, 4] array of absolute xyxy pixel coordinates (empty if nothing is detected)."""
    pil = Image.fromarray(image_np)
    h, w = pil.height, pil.width

    transform = Compose([
        RandomResize([800], max_size=1333),
        ToTensor(),
        Normalize([0.485,0.456,0.406], [0.229,0.224,0.225])
    ])

    # Preprocess the PIL image into a normalized tensor [C, H, W].
    img_t, _ = transform(pil, None)
    img_t = img_t.to(get_device())  # move to GPU if one is available

    # Run GroundingDINO's predict API: it tokenizes the caption, runs the forward
    # pass, and post-processes the outputs into boxes, scores, and phrases.
    boxes, scores, phrases = dino_predict(
        model=get_groundingdino_model(),
        image=img_t,
        caption=prompt,
        box_threshold=box_threshold,
        text_threshold=text_threshold,
        device=get_device()
    )

    if boxes.numel() == 0:
        return np.empty((0, 4))  # no detections → empty box array

    # Convert normalized cxcywh boxes (values in [0, 1], shape [N, 4])
    # to absolute xyxy pixel coordinates.
    boxes_abs = boxes * torch.tensor([w, h, w, h], device=boxes.device)
    xyxy = box_convert(boxes=boxes_abs, in_fmt="cxcywh", out_fmt="xyxy")
    sam_boxes = xyxy.cpu().numpy()  # shape [N, 4] in pixel coords

    return sam_boxes
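
# Illustrative usage sketch (not exercised anywhere in this module): driving
# get_dino_boxes_from_prompt from an image on disk. The file path and prompt
# are placeholders; cv2 loads images as BGR, so convert to RGB first.
def _example_detect_boxes(image_path: str = "example.jpg", prompt: str = "a dog"):
    bgr = cv2.imread(image_path)
    if bgr is None:
        raise FileNotFoundError(image_path)
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    # Returns an [N, 4] array of absolute xyxy boxes (empty if nothing matched).
    return get_dino_boxes_from_prompt(rgb, prompt)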

def get_sam_mask(image_np: np.ndarray, point_coords: np.ndarray, point_labels: np.ndarray, sam_boxes: np.ndarray):
    """Run SAM with point and (optional) box prompts and return (mask_2d, bbox), where
    mask_2d is a boolean HxW mask and bbox is a dict with x, y, width, and height."""
    pred = get_sam_predictor()
    pred.set_image(image_np)
    masks, scores, _ = pred.predict(
        point_coords=point_coords, 
        point_labels=point_labels,
        multimask_output=False,
        box=sam_boxes
    )
    idx = int(np.argmax(scores))
    selected_mask = masks[idx]  # boolean mask

    # Find bounding box from mask
    if selected_mask is None or selected_mask.size == 0:
        bbox = {"x": 0, "y": 0, "width": 0, "height": 0}
        mask_2d = selected_mask
    else:
        # Remove singleton dimensions if present
        mask_2d = np.squeeze(selected_mask)
        
        if mask_2d.ndim != 2:
            raise ValueError(f"Expected 2D mask, got shape {mask_2d.shape}")
    
        ys, xs = np.where(mask_2d)
        if len(xs) == 0 or len(ys) == 0:
            # fallback in case mask is empty
            bbox = {"x": 0, "y": 0, "width": 0, "height": 0}
        else:
            x_min, x_max = int(xs.min()), int(xs.max())
            y_min, y_max = int(ys.min()), int(ys.max())
            bbox = {
                "x": x_min,
                "y": y_min,
                "width": x_max - x_min,
                "height": y_max - y_min
            }

    return mask_2d, bbox
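
# Illustrative end-to-end sketch combining the two helpers above: detect boxes
# for a text prompt, then segment the first detection with SAM. Placing a single
# foreground point at the box centre is an assumption made for this example,
# not something the module prescribes.
def _example_prompted_segmentation(image_np: np.ndarray, prompt: str):
    boxes = get_dino_boxes_from_prompt(image_np, prompt)
    if len(boxes) == 0:
        return None, None
    box = boxes[0]  # xyxy pixel coordinates of the first detection
    cx = (box[0] + box[2]) / 2.0
    cy = (box[1] + box[3]) / 2.0
    point_coords = np.array([[cx, cy]], dtype=np.float32)
    point_labels = np.array([1], dtype=np.int64)  # 1 = foreground point
    return get_sam_mask(image_np, point_coords, point_labels, box)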

def embed_image_dino_large(image_np: np.ndarray) -> np.ndarray:
    """
    Embed image using DINOv2 large model
    
    Args:
        image_np: numpy array representing the image
        
    Returns:
        np.ndarray: normalized 1024-dimensional embedding
    """
    model = get_dinov2_large()
    transform = get_dinov2_transform()
    device = get_device()

    # Convert numpy array to PIL Image
    pil = Image.fromarray(image_np)
    
    # Apply preprocessing
    inp = transform(pil).unsqueeze(0).to(device)
    
    # Get embeddings
    with torch.no_grad():
        feats = model(inp).cpu().numpy()[0]
    
    # Normalize the features
    norm = np.linalg.norm(feats)
    return feats / norm if norm > 0 else feats
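
# Illustrative sketch: embedding only the region inside a bounding box (for
# example, the bbox dict returned by get_sam_mask) with DINOv2. The "x"/"y"/
# "width"/"height" keys follow the bbox format built in get_sam_mask.
def _example_embed_region(image_np: np.ndarray, bbox: dict) -> np.ndarray:
    x, y = bbox["x"], bbox["y"]
    w, h = bbox["width"], bbox["height"]
    crop = image_np[y:y + h + 1, x:x + w + 1]  # inclusive of the max corner
    return embed_image_dino_large(crop)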

def embed_image_clip(image_np: np.ndarray) -> np.ndarray:
    """Embed an RGB image with CLIP and return an L2-normalized feature vector."""
    model, preprocess = get_clip()
    device = get_device()
    pil = Image.fromarray(image_np)
    inp = preprocess(pil).unsqueeze(0).to(device)
    with torch.no_grad():
        feats = model.encode_image(inp).cpu().numpy()[0]
    norm = np.linalg.norm(feats)
    return feats / norm if norm > 0 else feats

def embed_text(text: str) -> np.ndarray:
    """Embed a text query with CLIP and return an L2-normalized feature vector."""
    model, _ = get_clip()
    device = get_device()
    tokens = clip.tokenize([text]).to(device)
    with torch.no_grad():
        text_features = model.encode_text(tokens)
    feats = text_features[0].cpu().numpy()
    # Normalize so text and image embeddings are directly comparable by dot product.
    norm = np.linalg.norm(feats)
    return feats / norm if norm > 0 else feats
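
# Illustrative sketch: scoring an image against a free-text query with the two
# CLIP helpers above. The vectors are re-normalized defensively so the dot
# product is a cosine similarity; the default query is just a placeholder.
def _example_clip_similarity(image_np: np.ndarray, query: str = "a photo of a cat") -> float:
    img_vec = embed_image_clip(image_np)
    txt_vec = embed_text(query)
    img_vec = img_vec / max(np.linalg.norm(img_vec), 1e-12)
    txt_vec = txt_vec / max(np.linalg.norm(txt_vec), 1e-12)
    return float(np.dot(img_vec, txt_vec))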

def generate_description_vllm(pil_image: Image.Image) -> str:
    """
    Generate a default caption for the image using the captioning pipeline.
    """
    output = captioner(pil_image)
    return output[0]['generated_text']
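
# Illustrative sketch: auto-captioning a crop and embedding the caption with
# CLIP, e.g. so a region can be indexed by text as well as by pixels. Feeding
# the captioner a PIL image built from a numpy crop is an assumption made here.
def _example_caption_and_embed(crop_np: np.ndarray):
    caption = generate_description_vllm(Image.fromarray(crop_np))
    return caption, embed_text(caption)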


def expand_coords_shape(point_coords, point_labels, box_count: int):
    """
    Expands point coordinates and labels to match the number of bounding boxes.

    Parameters:
      point_coords: Array of shape (P, 2) representing point coordinates.
      point_labels: Array of shape (P,) representing point labels.
      box_count: Number of bounding boxes to tile the coordinates and labels for.

    Returns:
      A tuple (expanded_point_coords, expanded_point_labels) where:
        - expanded_point_coords is of shape (B, P, 2).
        - expanded_point_labels is of shape (B, P).
    """
    # Add a batch dimension
    point_coords = point_coords[None, ...]  # (1, P, 2)
    point_labels = point_labels[None, ...]  # (1, P)

    # Tile to match the number of bounding boxes
    point_coords = np.tile(point_coords, (box_count, 1, 1))  # (B, P, 2)
    point_labels = np.tile(point_labels, (box_count, 1))     # (B, P)

    return point_coords, point_labels
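
# Illustrative sketch: tiling one set of click prompts across several detected
# boxes, e.g. before a batched SAM call that expects per-box point prompts.
# The coordinate values below are placeholders; only the shapes matter.
def _example_expand_prompts():
    point_coords = np.array([[120.0, 80.0], [200.0, 150.0]])  # (P, 2) with P = 2
    point_labels = np.array([1, 0])                           # (P,)
    coords_b, labels_b = expand_coords_shape(point_coords, point_labels, box_count=3)
    assert coords_b.shape == (3, 2, 2) and labels_b.shape == (3, 2)
    return coords_b, labels_b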