File size: 8,086 Bytes
232f64f
 
 
abb7d19
 
 
 
 
 
 
232f64f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
abb7d19
 
 
 
 
 
 
232f64f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
abb7d19
232f64f
 
abb7d19
232f64f
 
 
 
 
 
 
 
abb7d19
232f64f
 
 
 
 
 
 
 
 
 
 
abb7d19
232f64f
 
 
 
 
 
 
 
 
 
 
abb7d19
232f64f
 
 
 
 
 
 
abb7d19
232f64f
 
 
abb7d19
232f64f
 
 
 
 
 
 
 
 
 
 
 
 
abb7d19
 
 
 
 
 
 
232f64f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
abb7d19
232f64f
 
 
abb7d19
232f64f
 
 
abb7d19
 
 
232f64f
 
 
 
 
 
 
 
 
abb7d19
 
232f64f
 
 
 
abb7d19
232f64f
 
 
 
 
abb7d19
 
232f64f
abb7d19
232f64f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
"""
Video Intelligence Platform — Visual Encoders
SigLIP2 for frame embeddings + Grounding DINO for attribute detection.

Verified against transformers >= 5.x API (Apr 2026):
- SigLIP2: AutoModel + AutoProcessor → SiglipModel, get_image_features returns
  BaseModelOutputWithPooling (has .pooler_output when return_dict=True, which is default)
- Grounding DINO: AutoModelForZeroShotObjectDetection + AutoProcessor
  post_process_grounded_object_detection accepts input_ids (optional), threshold (not box_threshold),
  returns dict with "text_labels" and "labels" keys
"""
import io
import torch
import torch.nn.functional as F
import numpy as np
from PIL import Image
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass


@dataclass
class Detection:
    """Record describing one detected object: label, score, box, and time."""

    # Text label attached to this detection.
    label: str
    # Detection score for this box.
    confidence: float
    # Box corners as [x0, y0, x1, y1], expressed in absolute pixels.
    bbox: List[float]
    # Timestamp in seconds; defaults to 0.0 when the creator does not set it.
    # NOTE(review): presumably the source frame's position in the video —
    # confirm against the code that fills it in.
    timestamp_sec: float = 0.0


class SigLIPEncoder:
    """
    SigLIP2 encoder for frame → embedding and text → embedding.
    Shared embedding space enables cross-modal similarity search.

    Model: google/siglip2-so400m-patch14-384 (1152-dim embeddings)
    Key details:
    - get_image_features() / get_text_features() return a plain tensor in some
      transformers releases and a BaseModelOutputWithPooling in others; both
      shapes of result are accepted (see ``_pooled``).
    - Text MUST use padding="max_length" (SigLIP training requirement); it is
      also truncated to the text tower's maximum length so long queries
      cannot overflow the position embeddings.
    - Use sigmoid (not softmax) for similarity scores
    - Every embedding array returned by this class is float32, including the
      empty arrays returned for empty input.
    """

    # Fallbacks used when the loaded model's config does not expose a value.
    DEFAULT_EMBEDDING_DIM = 1152
    DEFAULT_MAX_TEXT_TOKENS = 64

    def __init__(self, model_name: str = "google/siglip2-so400m-patch14-384",
                 device: str = "cpu"):
        """
        Load the processor and model and move the model to ``device``.

        Args:
            model_name: Hugging Face checkpoint id passed to ``from_pretrained``.
            device: Torch device string the model and inputs are moved to.
        """
        from transformers import AutoModel, AutoProcessor

        print(f"🔄 Loading SigLIP2 ({model_name}) on {device}...")
        self.processor = AutoProcessor.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(
            model_name, torch_dtype=torch.float32
        ).to(device).eval()
        self.device = device
        # Read the width from the checkpoint instead of assuming 1152, so a
        # different model_name still yields correctly shaped empty results.
        self.embedding_dim = self._resolve_embedding_dim(self.model)
        print(f"   ✅ SigLIP2 loaded (dim={self.embedding_dim})")

    @classmethod
    def _resolve_embedding_dim(cls, model) -> int:
        """Return the vision tower's hidden size, or the 1152 fallback."""
        # NOTE(review): assumes the pooled image embedding width equals
        # config.vision_config.hidden_size (true for SigLIP checkpoints).
        vision_config = getattr(getattr(model, "config", None), "vision_config", None)
        hidden_size = getattr(vision_config, "hidden_size", None)
        return int(hidden_size) if hidden_size else cls.DEFAULT_EMBEDDING_DIM

    def _max_text_tokens(self) -> int:
        """Return the text tower's maximum sequence length, or the 64 fallback."""
        text_config = getattr(getattr(self.model, "config", None), "text_config", None)
        limit = getattr(text_config, "max_position_embeddings", None)
        return int(limit) if limit else self.DEFAULT_MAX_TEXT_TOKENS

    def _empty_embeddings(self) -> np.ndarray:
        """Return a [0, embedding_dim] float32 array, matching non-empty results."""
        return np.empty((0, self.embedding_dim), dtype=np.float32)

    @staticmethod
    def _pooled(outputs):
        """Return the pooled embedding tensor from a get_*_features() result."""
        if isinstance(outputs, torch.Tensor):
            return outputs
        return outputs.pooler_output

    @torch.no_grad()
    def embed_frames(self, images: List[Image.Image],
                     batch_size: int = 8) -> np.ndarray:
        """
        Embed a list of PIL images into normalized vectors.

        Args:
            images: Images to embed; processed in chunks of ``batch_size``.
            batch_size: Number of images per forward pass; must be >= 1.

        Returns:
            np.ndarray of shape [N, embedding_dim], float32, L2-normalized.
            An empty input yields a [0, embedding_dim] array.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        # A negative step would make range() empty and silently drop every
        # image, so reject it up front (zero already raised ValueError).
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        if not images:
            return self._empty_embeddings()

        chunks = []
        for start in range(0, len(images), batch_size):
            batch = images[start:start + batch_size]
            inputs = self.processor(images=batch, return_tensors="pt").to(self.device)
            features = self._pooled(self.model.get_image_features(**inputs))
            chunks.append(F.normalize(features, dim=-1).cpu().numpy())

        return np.concatenate(chunks, axis=0)

    @torch.no_grad()
    def embed_texts(self, texts: List[str]) -> np.ndarray:
        """
        Embed text queries into the same space as frames.

        Args:
            texts: Query strings to embed in a single forward pass.

        Returns:
            np.ndarray of shape [N, embedding_dim], float32, L2-normalized.
            An empty input yields a [0, embedding_dim] array.
        """
        if not texts:
            return self._empty_embeddings()

        inputs = self.processor(
            text=texts,
            padding="max_length",  # CRITICAL: required for SigLIP
            max_length=self._max_text_tokens(),
            truncation=True,  # over-long queries would overflow the text tower
            return_tensors="pt",
        ).to(self.device)
        features = self._pooled(self.model.get_text_features(**inputs))
        return F.normalize(features, dim=-1).cpu().numpy()

    def compute_similarity(self, frame_embeddings: np.ndarray,
                           text_embeddings: np.ndarray,
                           logit_scale: float = 5.0) -> np.ndarray:
        """
        Score every frame/text pair as sigmoid(logit_scale * cosine similarity).

        Both inputs are expected to be L2-normalized already, so the matrix
        product is the cosine similarity. The scale is a fixed approximation,
        not the checkpoint's learned logit scale and bias, so the results are
        monotonic ranking scores rather than calibrated probabilities.

        Args:
            frame_embeddings: Array of shape [num_frames, dim].
            text_embeddings: Array of shape [num_texts, dim].
            logit_scale: Multiplier applied to the cosine similarity before
                the sigmoid; the default reproduces the previous fixed 5.0.

        Returns:
            np.ndarray of shape [num_frames, num_texts]
        """
        similarity = frame_embeddings @ text_embeddings.T
        # SigLIP uses sigmoid, not softmax
        return 1 / (1 + np.exp(-similarity * logit_scale))


class GroundingDINODetector:
    """
    Grounding DINO for open-vocabulary object detection with attribute queries.
    Supports complex queries like "person wearing white clothes", "red car", etc.

    Model: IDEA-Research/grounding-dino-tiny
    Key details (transformers >= 5.x):
    - The text prompt is sent as one lowercase, dot-separated string
      ("label1. label2."), built by ``_build_text_query``. This is the same
      string the processor produces when handed a list of labels, but it does
      not depend on the processor's list handling and cannot be mistaken for
      a batch of texts when a label contains a ".".
    - post_process_grounded_object_detection: input_ids is optional,
      uses 'threshold' (not 'box_threshold'), returns both 'text_labels' and 'labels'
    """

    # Labels queried by detect_default_attributes().
    DEFAULT_LABELS: Tuple[str, ...] = (
        "person", "car", "truck", "bicycle", "motorcycle",
        "dog", "cat", "bird", "chair", "table",
        "building", "tree", "sign", "phone", "bag",
    )

    def __init__(self, model_name: str = "IDEA-Research/grounding-dino-tiny",
                 device: str = "cpu",
                 box_threshold: float = 0.35,
                 text_threshold: float = 0.25):
        """
        Load the processor and model and move the model to ``device``.

        Args:
            model_name: Hugging Face checkpoint id passed to ``from_pretrained``.
            device: Torch device string the model and inputs are moved to.
            box_threshold: Passed as ``threshold`` to post-processing.
            text_threshold: Passed as ``text_threshold`` to post-processing.
        """
        from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

        print(f"🔄 Loading Grounding DINO ({model_name}) on {device}...")
        self.processor = AutoProcessor.from_pretrained(model_name)
        self.model = AutoModelForZeroShotObjectDetection.from_pretrained(
            model_name
        ).to(device).eval()
        self.device = device
        self.box_threshold = box_threshold
        self.text_threshold = text_threshold
        print("   ✅ Grounding DINO loaded")

    @staticmethod
    def _build_text_query(labels: List[str]) -> str:
        """Merge labels into a lowercase "a. b." prompt; "" if nothing usable."""
        phrases = []
        for label in labels:
            # "." is the phrase separator in the prompt, so a dot inside a
            # label (or an already merged "a . b ." entry) is split here.
            for phrase in label.lower().split("."):
                phrase = phrase.strip()
                if phrase:
                    phrases.append(phrase)
        if not phrases:
            return ""
        return ". ".join(phrases) + "."

    @torch.no_grad()
    def detect(self, image: Image.Image, labels: List[str]) -> List[Detection]:
        """
        Detect objects matching the given text labels in an image.

        Args:
            image: PIL Image
            labels: List of text descriptions, e.g. ["person wearing white clothes", "red car"]

        Returns:
            List of Detection objects with labels, confidence, and bounding
            boxes (rounded to 2 decimals). ``timestamp_sec`` is left at its
            default. Returns an empty list, without running the model, when
            ``labels`` contains no non-blank text.
        """
        text_query = self._build_text_query(labels)
        if not text_query:
            # Nothing to look for; skip the forward pass entirely.
            return []

        inputs = self.processor(
            images=image,
            text=text_query,
            return_tensors="pt",
        ).to(self.device)

        outputs = self.model(**inputs)

        # transformers >= 5.x: threshold (not box_threshold), input_ids optional
        # target_sizes expects (height, width)
        results = self.processor.post_process_grounded_object_detection(
            outputs,
            threshold=self.box_threshold,
            text_threshold=self.text_threshold,
            target_sizes=[(image.height, image.width)],
        )
        if not results:
            return []

        # Only one image was submitted, so only the first result is relevant.
        result = results[0]
        # Both "text_labels" and "labels" exist in current API
        label_key = "text_labels" if "text_labels" in result else "labels"
        return [
            Detection(
                label=text_label,
                confidence=float(score),
                bbox=[round(coord, 2) for coord in box.tolist()],
            )
            for box, score, text_label in zip(
                result["boxes"], result["scores"], result[label_key]
            )
        ]

    def detect_default_attributes(self, image: Image.Image) -> List[Detection]:
        """
        Run detection with the built-in ``DEFAULT_LABELS`` set.

        The defaults are a fixed list of 15 common object names; anything
        outside that list is not queried.

        Args:
            image: PIL Image

        Returns:
            The result of ``detect(image, DEFAULT_LABELS)``.
        """
        return self.detect(image, list(self.DEFAULT_LABELS))