notRaphael committed on
Commit
232f64f
·
verified ·
1 Parent(s): 7335d00

Add visual encoders (SigLIP2 + Grounding DINO)

Browse files
Files changed (1) hide show
  1. video_intelligence/visual_encoders.py +188 -0
video_intelligence/visual_encoders.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Video Intelligence Platform — Visual Encoders
3
+ SigLIP2 for frame embeddings + Grounding DINO for attribute detection.
4
+ Both run on CPU (no GPU required).
5
+ """
6
+ import io
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import numpy as np
10
+ from PIL import Image
11
+ from typing import List, Dict, Optional, Tuple
12
+ from dataclasses import dataclass
13
+
14
+
15
@dataclass
class Detection:
    """One detected object: what it is, how sure we are, and where it sits.

    Fields, in constructor order:
        label: Text phrase the detection was matched against.
        confidence: Detector score for this box.
        bbox: Corner coordinates ``[x0, y0, x1, y1]`` in absolute pixels.
        timestamp_sec: Position of the source frame in the video, in seconds;
            stays at 0.0 unless the caller supplies a value.
    """

    label: str
    confidence: float
    # [x0, y0, x1, y1] in absolute pixels
    bbox: List[float]
    timestamp_sec: float = 0.0
22
+
23
+
24
class SigLIPEncoder:
    """
    SigLIP2 encoder for frame → embedding and text → embedding.

    Frames and text queries are embedded into one shared space, which
    enables cross-modal similarity search.
    """

    # Embedding width of the default so400m checkpoint; used as a fallback
    # when the loaded model's config does not expose the width.
    _DEFAULT_EMBEDDING_DIM = 1152
    # Text length used in the Hugging Face SigLIP2 usage examples; used as a
    # fallback when the config does not expose max_position_embeddings.
    _DEFAULT_TEXT_MAX_LENGTH = 64
    # Fixed stand-in for the model's learned logit scale (see
    # compute_similarity). The learned logit bias is NOT applied.
    _APPROX_LOGIT_SCALE = 5.0

    def __init__(self, model_name: str = "google/siglip2-so400m-patch14-384",
                 device: str = "cpu"):
        """
        Load the processor and model and put the model in eval mode.

        Args:
            model_name: Hugging Face model id or local path.
            device: Torch device string the model is moved to.
        """
        from transformers import AutoModel, AutoProcessor

        print(f"🔄 Loading SigLIP2 ({model_name}) on {device}...")
        self.processor = AutoProcessor.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(
            model_name, torch_dtype=torch.float32
        ).to(device).eval()
        self.device = device

        # Read the sizes from the checkpoint so that non-default model names
        # report the right values; fall back to the so400m defaults.
        config = getattr(self.model, "config", None)
        vision_config = getattr(config, "vision_config", None)
        text_config = getattr(config, "text_config", None)
        self.embedding_dim = getattr(
            vision_config, "hidden_size", self._DEFAULT_EMBEDDING_DIM
        )
        self.text_max_length = getattr(
            text_config, "max_position_embeddings", self._DEFAULT_TEXT_MAX_LENGTH
        )
        print(f" ✅ SigLIP2 loaded (dim={self.embedding_dim})")

    @staticmethod
    def _pooled(features):
        """
        Return the pooled embedding tensor from a ``get_*_features`` result.

        Depending on the installed transformers version the result is either
        an output object exposing ``pooler_output`` or the tensor itself;
        both are accepted.
        """
        return getattr(features, "pooler_output", features)

    @torch.no_grad()
    def embed_frames(self, images: List["Image.Image"],
                     batch_size: int = 8) -> np.ndarray:
        """
        Embed a list of PIL images into normalized vectors.

        Args:
            images: Frames to embed.
            batch_size: Number of frames per forward pass; must be >= 1.

        Returns:
            float32 np.ndarray of shape [N, embedding_dim], L2-normalized
            (embedding_dim is 1152 for the default checkpoint).

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        if not images:
            # Same dtype as the non-empty path so callers can stack results.
            return np.empty((0, self.embedding_dim), dtype=np.float32)

        chunks = []
        for start in range(0, len(images), batch_size):
            batch = images[start:start + batch_size]
            inputs = self.processor(images=batch, return_tensors="pt").to(self.device)
            pooled = self._pooled(self.model.get_image_features(**inputs))
            pooled = F.normalize(pooled, dim=-1)
            chunks.append(pooled.cpu().numpy())

        return np.concatenate(chunks, axis=0)

    @torch.no_grad()
    def embed_texts(self, texts: List[str]) -> np.ndarray:
        """
        Embed text queries into the same space as frames.

        Args:
            texts: Query strings to embed.

        Returns:
            float32 np.ndarray of shape [N, embedding_dim], L2-normalized
            (embedding_dim is 1152 for the default checkpoint).
        """
        if not texts:
            return np.empty((0, self.embedding_dim), dtype=np.float32)

        # Resolved lazily so instances that did not run __init__ still work.
        max_length = getattr(self, "text_max_length", self._DEFAULT_TEXT_MAX_LENGTH)
        inputs = self.processor(
            text=texts,
            padding="max_length",  # CRITICAL: required for SigLIP
            max_length=max_length,
            truncation=True,  # keep long queries within the text tower's limit
            return_tensors="pt",
        ).to(self.device)
        pooled = self._pooled(self.model.get_text_features(**inputs))
        pooled = F.normalize(pooled, dim=-1)
        return pooled.cpu().numpy()

    def compute_similarity(self, frame_embeddings: np.ndarray,
                           text_embeddings: np.ndarray) -> np.ndarray:
        """
        Score every frame against every text query.

        Takes the dot product of the two embedding matrices (cosine
        similarity when both are L2-normalized, as returned by
        ``embed_frames`` / ``embed_texts``) and squashes it with a sigmoid,
        following the SigLIP per-pair objective. The scale is a fixed
        approximation, not the model's learned logit scale and bias, so the
        scores are useful for ranking rather than as calibrated
        probabilities.

        Returns:
            np.ndarray of shape [num_frames, num_texts]
        """
        similarity = frame_embeddings @ text_embeddings.T
        # SigLIP uses sigmoid, not softmax
        return 1 / (1 + np.exp(-similarity * self._APPROX_LOGIT_SCALE))
99
+
100
+
101
class GroundingDINODetector:
    """
    Grounding DINO for open-vocabulary object detection with attribute queries.
    Supports complex queries like "person wearing white clothes", "red car", etc.
    """

    # Labels used by detect_default_attributes; kept immutable at class level
    # so the list is not rebuilt on every call.
    _DEFAULT_LABELS = (
        "person", "car", "truck", "bicycle", "motorcycle",
        "dog", "cat", "bird", "chair", "table",
        "building", "tree", "sign", "phone", "bag",
    )

    def __init__(self, model_name: str = "IDEA-Research/grounding-dino-tiny",
                 device: str = "cpu",
                 box_threshold: float = 0.35,
                 text_threshold: float = 0.25):
        """
        Load the processor and model and put the model in eval mode.

        Args:
            model_name: Hugging Face model id or local path.
            device: Torch device string the model is moved to.
            box_threshold: Score threshold passed to post-processing as
                ``threshold``.
            text_threshold: Text threshold passed to post-processing.
        """
        from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

        print(f"🔄 Loading Grounding DINO ({model_name}) on {device}...")
        self.processor = AutoProcessor.from_pretrained(model_name)
        self.model = AutoModelForZeroShotObjectDetection.from_pretrained(
            model_name
        ).to(device).eval()
        self.device = device
        self.box_threshold = box_threshold
        self.text_threshold = text_threshold
        print(" ✅ Grounding DINO loaded")

    def _format_query(self, labels: List[str]) -> str:
        """
        Format labels into Grounding DINO query format.
        Rules: lowercase, labels separated by ' . ', query ends with ' .'

        Blank labels are dropped and trailing periods on a label are removed
        so the separator is never doubled. Returns an empty string when no
        usable label remains.

        Example: ["person in white", "red car"] → "person in white . red car ."
        """
        cleaned = (label.lower().strip().rstrip(". ") for label in labels)
        phrases = [phrase for phrase in cleaned if phrase]
        if not phrases:
            return ""
        return " . ".join(phrases) + " ."

    @torch.no_grad()
    def detect(self, image: "Image.Image", labels: List[str]) -> List["Detection"]:
        """
        Detect objects matching the given text labels in an image.

        Args:
            image: PIL Image
            labels: List of text descriptions, e.g. ["person wearing white clothes", "red car"]

        Returns:
            List of Detection objects with labels, confidence, and bounding boxes
            (rounded to 2 decimals). Empty when there are no usable labels or
            nothing is detected.
        """
        text_query = self._format_query(labels)
        if not text_query:
            # Nothing to look for: skip the (expensive) forward pass.
            return []

        inputs = self.processor(
            images=image,
            text=text_query,
            return_tensors="pt",
        ).to(self.device)

        outputs = self.model(**inputs)

        results = self.processor.post_process_grounded_object_detection(
            outputs,
            inputs.input_ids,
            threshold=self.box_threshold,
            text_threshold=self.text_threshold,
            target_sizes=[image.size[::-1]],  # PIL gives (width, height); we need (height, width)
        )
        if not results:
            return []

        # A single image was passed in, so only the first result is relevant.
        result = results[0]
        return [
            Detection(
                label=text_label,
                confidence=float(score),
                bbox=[round(coord, 2) for coord in box.tolist()],
            )
            for box, score, text_label in zip(
                result["boxes"], result["scores"], result["text_labels"]
            )
        ]

    def detect_default_attributes(self, image: "Image.Image") -> List["Detection"]:
        """
        Run detection with a built-in set of common object labels.

        Delegates to ``detect``, which already disables gradient tracking.
        """
        return self.detect(image, list(self._DEFAULT_LABELS))