fix: update visual_encoders.py - verified API for transformers 5.x (SigLIP2 + Grounding DINO)
Browse files
video_intelligence/visual_encoders.py
CHANGED
|
@@ -1,7 +1,13 @@
|
|
| 1 |
"""
|
| 2 |
Video Intelligence Platform — Visual Encoders
|
| 3 |
SigLIP2 for frame embeddings + Grounding DINO for attribute detection.
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
"""
|
| 6 |
import io
|
| 7 |
import torch
|
|
@@ -25,6 +31,13 @@ class SigLIPEncoder:
|
|
| 25 |
"""
|
| 26 |
SigLIP2 encoder for frame → embedding and text → embedding.
|
| 27 |
Shared embedding space enables cross-modal similarity search.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
"""
|
| 29 |
|
| 30 |
def __init__(self, model_name: str = "google/siglip2-so400m-patch14-384",
|
|
@@ -42,10 +55,10 @@ class SigLIPEncoder:
|
|
| 42 |
|
| 43 |
@torch.no_grad()
|
| 44 |
def embed_frames(self, images: List[Image.Image],
|
| 45 |
-
|
| 46 |
"""
|
| 47 |
Embed a list of PIL images into normalized vectors.
|
| 48 |
-
|
| 49 |
Returns:
|
| 50 |
np.ndarray of shape [N, 1152], L2-normalized
|
| 51 |
"""
|
|
@@ -54,6 +67,7 @@ class SigLIPEncoder:
|
|
| 54 |
for i in range(0, len(images), batch_size):
|
| 55 |
batch = images[i:i + batch_size]
|
| 56 |
inputs = self.processor(images=batch, return_tensors="pt").to(self.device)
|
|
|
|
| 57 |
outputs = self.model.get_image_features(**inputs)
|
| 58 |
embeddings = outputs.pooler_output # [B, 1152]
|
| 59 |
embeddings = F.normalize(embeddings, dim=-1)
|
|
@@ -65,7 +79,7 @@ class SigLIPEncoder:
|
|
| 65 |
def embed_texts(self, texts: List[str]) -> np.ndarray:
|
| 66 |
"""
|
| 67 |
Embed text queries into the same space as frames.
|
| 68 |
-
|
| 69 |
Returns:
|
| 70 |
np.ndarray of shape [N, 1152], L2-normalized
|
| 71 |
"""
|
|
@@ -77,6 +91,7 @@ class SigLIPEncoder:
|
|
| 77 |
padding="max_length", # CRITICAL: required for SigLIP
|
| 78 |
return_tensors="pt",
|
| 79 |
).to(self.device)
|
|
|
|
| 80 |
outputs = self.model.get_text_features(**inputs)
|
| 81 |
embeddings = outputs.pooler_output # [N, 1152]
|
| 82 |
embeddings = F.normalize(embeddings, dim=-1)
|
|
@@ -84,11 +99,11 @@ class SigLIPEncoder:
|
|
| 84 |
|
| 85 |
@torch.no_grad()
|
| 86 |
def compute_similarity(self, frame_embeddings: np.ndarray,
|
| 87 |
-
|
| 88 |
"""
|
| 89 |
Compute cosine similarity between frame and text embeddings.
|
| 90 |
Uses sigmoid (SigLIP objective) for per-pair probabilities.
|
| 91 |
-
|
| 92 |
Returns:
|
| 93 |
np.ndarray of shape [num_frames, num_texts]
|
| 94 |
"""
|
|
@@ -102,6 +117,13 @@ class GroundingDINODetector:
|
|
| 102 |
"""
|
| 103 |
Grounding DINO for open-vocabulary object detection with attribute queries.
|
| 104 |
Supports complex queries like "person wearing white clothes", "red car", etc.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
"""
|
| 106 |
|
| 107 |
def __init__(self, model_name: str = "IDEA-Research/grounding-dino-tiny",
|
|
@@ -120,29 +142,21 @@ class GroundingDINODetector:
|
|
| 120 |
self.text_threshold = text_threshold
|
| 121 |
print(f" ✅ Grounding DINO loaded")
|
| 122 |
|
| 123 |
-
def _format_query(self, labels: List[str]) -> str:
|
| 124 |
-
"""
|
| 125 |
-
Format labels into Grounding DINO query format.
|
| 126 |
-
Rules: lowercase, each label ends with ' . '
|
| 127 |
-
|
| 128 |
-
Example: ["person in white", "red car"] → "person in white . red car ."
|
| 129 |
-
"""
|
| 130 |
-
formatted = " . ".join(l.lower().strip() for l in labels) + " ."
|
| 131 |
-
return formatted
|
| 132 |
-
|
| 133 |
@torch.no_grad()
|
| 134 |
def detect(self, image: Image.Image, labels: List[str]) -> List[Detection]:
|
| 135 |
"""
|
| 136 |
Detect objects matching the given text labels in an image.
|
| 137 |
-
|
| 138 |
Args:
|
| 139 |
image: PIL Image
|
| 140 |
labels: List of text descriptions, e.g. ["person wearing white clothes", "red car"]
|
| 141 |
-
|
| 142 |
Returns:
|
| 143 |
List of Detection objects with labels, confidence, and bounding boxes
|
| 144 |
"""
|
| 145 |
-
|
|
|
|
|
|
|
| 146 |
|
| 147 |
inputs = self.processor(
|
| 148 |
images=image,
|
|
@@ -152,19 +166,22 @@ class GroundingDINODetector:
|
|
| 152 |
|
| 153 |
outputs = self.model(**inputs)
|
| 154 |
|
|
|
|
|
|
|
| 155 |
results = self.processor.post_process_grounded_object_detection(
|
| 156 |
outputs,
|
| 157 |
-
inputs.input_ids,
|
| 158 |
threshold=self.box_threshold,
|
| 159 |
text_threshold=self.text_threshold,
|
| 160 |
-
target_sizes=[image.
|
| 161 |
)
|
| 162 |
|
| 163 |
detections = []
|
| 164 |
if results:
|
| 165 |
result = results[0]
|
|
|
|
|
|
|
| 166 |
for box, score, text_label in zip(
|
| 167 |
-
result["boxes"], result["scores"], result[
|
| 168 |
):
|
| 169 |
detections.append(Detection(
|
| 170 |
label=text_label,
|
|
|
|
| 1 |
"""
|
| 2 |
Video Intelligence Platform — Visual Encoders
|
| 3 |
SigLIP2 for frame embeddings + Grounding DINO for attribute detection.
|
| 4 |
+
|
| 5 |
+
Verified against transformers >= 5.x API (Apr 2026):
|
| 6 |
+
- SigLIP2: AutoModel + AutoProcessor → SiglipModel, get_image_features returns
|
| 7 |
+
BaseModelOutputWithPooling (has .pooler_output when return_dict=True, which is default)
|
| 8 |
+
- Grounding DINO: AutoModelForZeroShotObjectDetection + AutoProcessor
|
| 9 |
+
post_process_grounded_object_detection accepts input_ids (optional), threshold (not box_threshold),
|
| 10 |
+
returns a list of per-image dicts, each with "text_labels" and "labels" keys
|
| 11 |
"""
|
| 12 |
import io
|
| 13 |
import torch
|
|
|
|
| 31 |
"""
|
| 32 |
SigLIP2 encoder for frame → embedding and text → embedding.
|
| 33 |
Shared embedding space enables cross-modal similarity search.
|
| 34 |
+
|
| 35 |
+
Model: google/siglip2-so400m-patch14-384 (1152-dim embeddings)
|
| 36 |
+
Key details:
|
| 37 |
+
- get_image_features() returns BaseModelOutputWithPooling (return_dict=True default)
|
| 38 |
+
- .pooler_output gives [B, 1152] pooled representation
|
| 39 |
+
- Text MUST use padding="max_length" (SigLIP training requirement)
|
| 40 |
+
- Use sigmoid (not softmax) for similarity scores
|
| 41 |
"""
|
| 42 |
|
| 43 |
def __init__(self, model_name: str = "google/siglip2-so400m-patch14-384",
|
|
|
|
| 55 |
|
| 56 |
@torch.no_grad()
|
| 57 |
def embed_frames(self, images: List[Image.Image],
|
| 58 |
+
batch_size: int = 8) -> np.ndarray:
|
| 59 |
"""
|
| 60 |
Embed a list of PIL images into normalized vectors.
|
| 61 |
+
|
| 62 |
Returns:
|
| 63 |
np.ndarray of shape [N, 1152], L2-normalized
|
| 64 |
"""
|
|
|
|
| 67 |
for i in range(0, len(images), batch_size):
|
| 68 |
batch = images[i:i + batch_size]
|
| 69 |
inputs = self.processor(images=batch, return_tensors="pt").to(self.device)
|
| 70 |
+
# get_image_features returns BaseModelOutputWithPooling (return_dict=True by default)
|
| 71 |
outputs = self.model.get_image_features(**inputs)
|
| 72 |
embeddings = outputs.pooler_output # [B, 1152]
|
| 73 |
embeddings = F.normalize(embeddings, dim=-1)
|
|
|
|
| 79 |
def embed_texts(self, texts: List[str]) -> np.ndarray:
|
| 80 |
"""
|
| 81 |
Embed text queries into the same space as frames.
|
| 82 |
+
|
| 83 |
Returns:
|
| 84 |
np.ndarray of shape [N, 1152], L2-normalized
|
| 85 |
"""
|
|
|
|
| 91 |
padding="max_length", # CRITICAL: required for SigLIP
|
| 92 |
return_tensors="pt",
|
| 93 |
).to(self.device)
|
| 94 |
+
# get_text_features returns BaseModelOutputWithPooling (return_dict=True by default)
|
| 95 |
outputs = self.model.get_text_features(**inputs)
|
| 96 |
embeddings = outputs.pooler_output # [N, 1152]
|
| 97 |
embeddings = F.normalize(embeddings, dim=-1)
|
|
|
|
| 99 |
|
| 100 |
@torch.no_grad()
|
| 101 |
def compute_similarity(self, frame_embeddings: np.ndarray,
|
| 102 |
+
text_embeddings: np.ndarray) -> np.ndarray:
|
| 103 |
"""
|
| 104 |
Compute cosine similarity between frame and text embeddings.
|
| 105 |
Uses sigmoid (SigLIP objective) for per-pair probabilities.
|
| 106 |
+
|
| 107 |
Returns:
|
| 108 |
np.ndarray of shape [num_frames, num_texts]
|
| 109 |
"""
|
|
|
|
| 117 |
"""
|
| 118 |
Grounding DINO for open-vocabulary object detection with attribute queries.
|
| 119 |
Supports complex queries like "person wearing white clothes", "red car", etc.
|
| 120 |
+
|
| 121 |
+
Model: IDEA-Research/grounding-dino-tiny
|
| 122 |
+
Key details (transformers >= 5.x):
|
| 123 |
+
- Processor's __call__ accepts text as str, list[str], or list[list[str]]
|
| 124 |
+
and auto-converts to the "label1 . label2 ." format internally
|
| 125 |
+
- post_process_grounded_object_detection: input_ids is optional,
|
| 126 |
+
uses 'threshold' (not 'box_threshold'), returns both 'text_labels' and 'labels'
|
| 127 |
"""
|
| 128 |
|
| 129 |
def __init__(self, model_name: str = "IDEA-Research/grounding-dino-tiny",
|
|
|
|
| 142 |
self.text_threshold = text_threshold
|
| 143 |
print(f" ✅ Grounding DINO loaded")
|
| 144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
@torch.no_grad()
|
| 146 |
def detect(self, image: Image.Image, labels: List[str]) -> List[Detection]:
|
| 147 |
"""
|
| 148 |
Detect objects matching the given text labels in an image.
|
| 149 |
+
|
| 150 |
Args:
|
| 151 |
image: PIL Image
|
| 152 |
labels: List of text descriptions, e.g. ["person wearing white clothes", "red car"]
|
| 153 |
+
|
| 154 |
Returns:
|
| 155 |
List of Detection objects with labels, confidence, and bounding boxes
|
| 156 |
"""
|
| 157 |
+
# Processor accepts list of labels directly and converts to correct format
|
| 158 |
+
# Also accepts the "label1 . label2 ." string format
|
| 159 |
+
text_query = [l.lower().strip() for l in labels]
|
| 160 |
|
| 161 |
inputs = self.processor(
|
| 162 |
images=image,
|
|
|
|
| 166 |
|
| 167 |
outputs = self.model(**inputs)
|
| 168 |
|
| 169 |
+
# transformers >= 5.x: threshold (not box_threshold), input_ids optional
|
| 170 |
+
# target_sizes expects (height, width)
|
| 171 |
results = self.processor.post_process_grounded_object_detection(
|
| 172 |
outputs,
|
|
|
|
| 173 |
threshold=self.box_threshold,
|
| 174 |
text_threshold=self.text_threshold,
|
| 175 |
+
target_sizes=[(image.height, image.width)],
|
| 176 |
)
|
| 177 |
|
| 178 |
detections = []
|
| 179 |
if results:
|
| 180 |
result = results[0]
|
| 181 |
+
# Both "text_labels" and "labels" exist in current API
|
| 182 |
+
label_key = "text_labels" if "text_labels" in result else "labels"
|
| 183 |
for box, score, text_label in zip(
|
| 184 |
+
result["boxes"], result["scores"], result[label_key]
|
| 185 |
):
|
| 186 |
detections.append(Detection(
|
| 187 |
label=text_label,
|