import numpy as np
import cv2
from PIL import Image
import clip
import torch
from .models import get_sam_predictor, get_clip, get_device, get_groundingdino_model, get_dinov2_large
from torchvision.ops import box_convert
from groundingdino.datasets.transforms import Compose, RandomResize, ToTensor, Normalize
from groundingdino.util.inference import predict as dino_predict
from torchvision import transforms
from transformers import pipeline


# Image captioning pipeline (used by generate_description_vllm below).
captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")

# Resize transform (shorter side to 512 px, bicubic), kept at module level for reuse.
resize_transform = transforms.Compose([
    transforms.Resize(512, interpolation=transforms.InterpolationMode.BICUBIC)
])


def get_dinov2_transform():
    """Get DINOv2 preprocessing transform"""
    transform = transforms.Compose([
        transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    return transform

def get_dino_boxes_from_prompt(image_np: np.ndarray, prompt: str, box_threshold=0.3, text_threshold=0.25) -> np.ndarray:
    """Run GroundingDINO on an RGB image and return the boxes matching the text prompt
    as an [N, 4] array of absolute xyxy pixel coordinates (empty if nothing is detected)."""
    pil = Image.fromarray(image_np)
    h, w = pil.height, pil.width

    transform = Compose([
        RandomResize([800], max_size=1333),
        ToTensor(),
        Normalize([0.485,0.456,0.406], [0.229,0.224,0.225])
    ])

    # Preprocess the PIL image into a normalized tensor [C, H, W].
    img_t, _ = transform(pil, None)
    img_t = img_t.to(get_device())  # move to GPU if one is available

    # Run GroundingDINO's predict API: it tokenizes the caption, runs the forward
    # pass, and post-processes the outputs into boxes, scores, and phrases.
    boxes, scores, phrases = dino_predict(
        model=get_groundingdino_model(),
        image=img_t,
        caption=prompt,
        box_threshold=box_threshold,
        text_threshold=text_threshold,
        device=get_device()
    )

    if boxes.numel() == 0:
        return np.empty((0, 4))  # no detections → empty box array

    # Convert normalized cxcywh boxes (values in [0, 1], shape [N, 4])
    # to absolute xyxy pixel coordinates.
    boxes_abs = boxes * torch.tensor([w, h, w, h], device=boxes.device)
    xyxy = box_convert(boxes=boxes_abs, in_fmt="cxcywh", out_fmt="xyxy")
    sam_boxes = xyxy.cpu().numpy()  # shape [N, 4] in pixel coords

    return sam_boxes
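
# Illustrative usage sketch (not exercised anywhere in this module): driving
# get_dino_boxes_from_prompt from an image on disk. The file path and prompt
# are placeholders; cv2 loads images as BGR, so convert to RGB first.
def _example_detect_boxes(image_path: str = "example.jpg", prompt: str = "a dog"):
    bgr = cv2.imread(image_path)
    if bgr is None:
        raise FileNotFoundError(image_path)
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    # Returns an [N, 4] array of absolute xyxy boxes (empty if nothing matched).
    return get_dino_boxes_from_prompt(rgb, prompt)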

def get_sam_mask(image_np: np.ndarray, point_coords: np.ndarray, point_labels: np.ndarray, sam_boxes: np.ndarray):
    """Run SAM with point and (optional) box prompts and return (mask_2d, bbox), where
    mask_2d is a boolean HxW mask and bbox is a dict with x, y, width, and height."""
    pred = get_sam_predictor()
    pred.set_image(image_np)
    masks, scores, _ = pred.predict(
        point_coords=point_coords, 
        point_labels=point_labels,
        multimask_output=False,
        box=sam_boxes
    )
    idx = int(np.argmax(scores))
    selected_mask = masks[idx]  # boolean mask

    # Find bounding box from mask
    if selected_mask is None or selected_mask.size == 0:
        bbox = {"x": 0, "y": 0, "width": 0, "height": 0}
        mask_2d = selected_mask
    else:
        # Remove singleton dimensions if present
        mask_2d = np.squeeze(selected_mask)
        
        if mask_2d.ndim != 2:
            raise ValueError(f"Expected 2D mask, got shape {mask_2d.shape}")
    
        ys, xs = np.where(mask_2d)
        if len(xs) == 0 or len(ys) == 0:
            # fallback in case mask is empty
            bbox = {"x": 0, "y": 0, "width": 0, "height": 0}
        else:
            x_min, x_max = int(xs.min()), int(xs.max())
            y_min, y_max = int(ys.min()), int(ys.max())
            bbox = {
                "x": x_min,
                "y": y_min,
                "width": x_max - x_min,
                "height": y_max - y_min
            }

    return mask_2d, bbox
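
# Illustrative end-to-end sketch combining the two helpers above: detect boxes
# for a text prompt, then segment the first detection with SAM. Placing a single
# foreground point at the box centre is an assumption made for this example,
# not something the module prescribes.
def _example_prompted_segmentation(image_np: np.ndarray, prompt: str):
    boxes = get_dino_boxes_from_prompt(image_np, prompt)
    if len(boxes) == 0:
        return None, None
    box = boxes[0]  # xyxy pixel coordinates of the first detection
    cx = (box[0] + box[2]) / 2.0
    cy = (box[1] + box[3]) / 2.0
    point_coords = np.array([[cx, cy]], dtype=np.float32)
    point_labels = np.array([1], dtype=np.int64)  # 1 = foreground point
    return get_sam_mask(image_np, point_coords, point_labels, box)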

def embed_image_dino_large(image_np: np.ndarray) -> np.ndarray:
    """
    Embed image using DINOv2 large model
    
    Args:
        image_np: numpy array representing the image
        
    Returns:
        np.ndarray: normalized 1024-dimensional embedding
    """
    model = get_dinov2_large()
    transform = get_dinov2_transform()
    device = get_device()

    # Convert numpy array to PIL Image
    pil = Image.fromarray(image_np)
    
    # Apply preprocessing
    inp = transform(pil).unsqueeze(0).to(device)
    
    # Get embeddings
    with torch.no_grad():
        feats = model(inp).cpu().numpy()[0]
    
    # Normalize the features
    norm = np.linalg.norm(feats)
    return feats / norm if norm > 0 else feats
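
# Illustrative sketch: embedding only the region inside a bounding box (for
# example, the bbox dict returned by get_sam_mask) with DINOv2. The "x"/"y"/
# "width"/"height" keys follow the bbox format built in get_sam_mask.
def _example_embed_region(image_np: np.ndarray, bbox: dict) -> np.ndarray:
    x, y = bbox["x"], bbox["y"]
    w, h = bbox["width"], bbox["height"]
    crop = image_np[y:y + h + 1, x:x + w + 1]  # inclusive of the max corner
    return embed_image_dino_large(crop)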

def embed_image_clip(image_np: np.ndarray) -> np.ndarray:
    """Embed an RGB image with CLIP and return an L2-normalized feature vector."""
    model, preprocess = get_clip()
    device = get_device()
    pil = Image.fromarray(image_np)
    inp = preprocess(pil).unsqueeze(0).to(device)
    with torch.no_grad():
        feats = model.encode_image(inp).cpu().numpy()[0]
    norm = np.linalg.norm(feats)
    return feats / norm if norm > 0 else feats

def embed_text(text: str) -> np.ndarray:
    """Embed a text query with CLIP and return an L2-normalized feature vector."""
    model, _ = get_clip()
    device = get_device()
    tokens = clip.tokenize([text]).to(device)
    with torch.no_grad():
        text_features = model.encode_text(tokens)
    feats = text_features[0].cpu().numpy()
    # Normalize so text and image embeddings are directly comparable by dot product.
    norm = np.linalg.norm(feats)
    return feats / norm if norm > 0 else feats
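
# Illustrative sketch: scoring an image against a free-text query with the two
# CLIP helpers above. The vectors are re-normalized defensively so the dot
# product is a cosine similarity; the default query is just a placeholder.
def _example_clip_similarity(image_np: np.ndarray, query: str = "a photo of a cat") -> float:
    img_vec = embed_image_clip(image_np)
    txt_vec = embed_text(query)
    img_vec = img_vec / max(np.linalg.norm(img_vec), 1e-12)
    txt_vec = txt_vec / max(np.linalg.norm(txt_vec), 1e-12)
    return float(np.dot(img_vec, txt_vec))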

def generate_description_vllm(pil_image: Image.Image) -> str:
    """
    Generate a default caption for the image using the captioning pipeline.
    """
    output = captioner(pil_image)
    return output[0]['generated_text']
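
# Illustrative sketch: auto-captioning a crop and embedding the caption with
# CLIP, e.g. so a region can be indexed by text as well as by pixels. Feeding
# the captioner a PIL image built from a numpy crop is an assumption made here.
def _example_caption_and_embed(crop_np: np.ndarray):
    caption = generate_description_vllm(Image.fromarray(crop_np))
    return caption, embed_text(caption)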


def expand_coords_shape(point_coords, point_labels, box_count: int):
    """
    Expands point coordinates and labels to match the number of bounding boxes.

    Parameters:
      point_coords: Array of shape (P, 2) representing point coordinates.
      point_labels: Array of shape (P,) representing point labels.
      box_count: Number of bounding boxes to tile the coordinates and labels for.

    Returns:
      A tuple (expanded_point_coords, expanded_point_labels) where:
        - expanded_point_coords is of shape (B, P, 2).
        - expanded_point_labels is of shape (B, P).
    """
    # Add a batch dimension
    point_coords = point_coords[None, ...]  # (1, P, 2)
    point_labels = point_labels[None, ...]  # (1, P)

    # Tile to match the number of bounding boxes
    point_coords = np.tile(point_coords, (box_count, 1, 1))  # (B, P, 2)
    point_labels = np.tile(point_labels, (box_count, 1))     # (B, P)

    return point_coords, point_labels
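
# Illustrative sketch: tiling one set of click prompts across several detected
# boxes, e.g. before a batched SAM call that expects per-box point prompts.
# The coordinate values below are placeholders; only the shapes matter.
def _example_expand_prompts():
    point_coords = np.array([[120.0, 80.0], [200.0, 150.0]])  # (P, 2) with P = 2
    point_labels = np.array([1, 0])                           # (P,)
    coords_b, labels_b = expand_coords_shape(point_coords, point_labels, box_count=3)
    assert coords_b.shape == (3, 2, 2) and labels_b.shape == (3, 2)
    return coords_b, labels_b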