"""
Video Intelligence Platform — Visual Encoders
SigLIP2 for frame embeddings + Grounding DINO for attribute detection.
Verified against transformers >= 5.x API (Apr 2026):
- SigLIP2: AutoModel + AutoProcessor → SiglipModel, get_image_features returns
BaseModelOutputWithPooling (has .pooler_output when return_dict=True, which is default)
- Grounding DINO: AutoModelForZeroShotObjectDetection + AutoProcessor
post_process_grounded_object_detection accepts input_ids (optional), threshold (not box_threshold),
returns dict with "text_labels" and "labels" keys
"""
import io
import torch
import torch.nn.functional as F
import numpy as np
from PIL import Image
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass
@dataclass
class Detection:
    """One detected object: its label, its score, and where it sits in the image."""

    # Text label reported for the detected object.
    label: str
    # Detector score for this match.
    confidence: float
    # Box corners as [x0, y0, x1, y1], in absolute pixels.
    bbox: List[float]
    # Timestamp in seconds (presumably of the source frame); 0.0 when not set.
    timestamp_sec: float = 0.0
class SigLIPEncoder:
    """
    SigLIP2 encoder for frame -> embedding and text -> embedding.

    Shared embedding space enables cross-modal similarity search.
    Model: google/siglip2-so400m-patch14-384 (1152-dim embeddings)

    Key details:
    - get_image_features() / get_text_features() may return either a
      BaseModelOutputWithPooling (pooled vectors in .pooler_output) or a plain
      tensor; both shapes of result are accepted here
    - Text MUST use padding="max_length" (SigLIP training requirement)
    - Use sigmoid (not softmax) for similarity scores
    """

    # Embedding width used when the loaded model's config does not expose one.
    DEFAULT_EMBEDDING_DIM = 1152

    def __init__(self, model_name: str = "google/siglip2-so400m-patch14-384",
                 device: str = "cpu"):
        """
        Load the processor and model and move the model to ``device``.

        Args:
            model_name: Checkpoint name passed to ``from_pretrained``.
            device: Torch device string the model and inputs are moved to.
        """
        # Imported lazily so importing this module does not require transformers.
        from transformers import AutoModel, AutoProcessor

        print(f"Loading SigLIP2 ({model_name}) on {device}...")
        self.processor = AutoProcessor.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(
            model_name, torch_dtype=torch.float32
        ).to(device).eval()
        self.device = device
        # NOTE(review): assumes the pooled embedding width equals
        # vision_config.hidden_size for SigLIP checkpoints; falls back to 1152
        # when the config does not carry that field. Confirm for other models.
        vision_config = getattr(self.model.config, "vision_config", None)
        self.embedding_dim = getattr(
            vision_config, "hidden_size", self.DEFAULT_EMBEDDING_DIM
        )
        print(f" ✅ SigLIP2 loaded (dim={self.embedding_dim})")

    @staticmethod
    def _pooled(outputs):
        """Return ``outputs.pooler_output`` when present, else ``outputs`` itself."""
        pooled = getattr(outputs, "pooler_output", None)
        return outputs if pooled is None else pooled

    @torch.no_grad()
    def embed_frames(self, images: List["Image.Image"],
                     batch_size: int = 8) -> np.ndarray:
        """
        Embed a list of PIL images into normalized vectors.

        Args:
            images: Images to embed, processed ``batch_size`` at a time.
            batch_size: Number of images per forward pass; must be >= 1.

        Returns:
            np.ndarray of shape [N, embedding_dim], L2-normalized, float32.
            An empty input yields shape [0, embedding_dim].

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        # A negative step would make range() empty and silently drop all images.
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        if len(images) == 0:
            # float32 matches the dtype produced by the float32 model.
            return np.empty((0, self.embedding_dim), dtype=np.float32)

        chunks = []
        for start in range(0, len(images), batch_size):
            batch = images[start:start + batch_size]
            inputs = self.processor(images=batch, return_tensors="pt").to(self.device)
            embeddings = self._pooled(self.model.get_image_features(**inputs))
            embeddings = F.normalize(embeddings, dim=-1)
            chunks.append(embeddings.cpu().numpy())
        return np.concatenate(chunks, axis=0)

    @torch.no_grad()
    def embed_texts(self, texts: List[str]) -> np.ndarray:
        """
        Embed text queries into the same space as frames.

        Args:
            texts: Query strings; all are encoded in a single forward pass.

        Returns:
            np.ndarray of shape [N, embedding_dim], L2-normalized, float32.
            An empty input yields shape [0, embedding_dim].
        """
        if not texts:
            return np.empty((0, self.embedding_dim), dtype=np.float32)
        inputs = self.processor(
            text=texts,
            padding="max_length",  # CRITICAL: required for SigLIP
            return_tensors="pt",
        ).to(self.device)
        embeddings = self._pooled(self.model.get_text_features(**inputs))
        embeddings = F.normalize(embeddings, dim=-1)
        return embeddings.cpu().numpy()

    def compute_similarity(self, frame_embeddings: np.ndarray,
                           text_embeddings: np.ndarray,
                           logit_scale: float = 5.0,
                           logit_bias: float = 0.0) -> np.ndarray:
        """
        Turn frame/text embeddings into per-pair sigmoid scores.

        Computes ``sigmoid(logit_scale * (frames @ texts.T) + logit_bias)``.
        The dot product is a cosine similarity only if both inputs are
        L2-normalized, as the embed_* methods return them.

        Args:
            frame_embeddings: Array of shape [num_frames, dim].
            text_embeddings: Array of shape [num_texts, dim].
            logit_scale: Multiplier applied before the sigmoid. The default
                5.0 is an approximation, not the model's learned value.
            logit_bias: Offset added before the sigmoid.

        Returns:
            np.ndarray of shape [num_frames, num_texts] with values in [0, 1].
        """
        logits = (frame_embeddings @ text_embeddings.T) * logit_scale + logit_bias
        # SigLIP uses sigmoid, not softmax. exp() may overflow to inf for very
        # negative logits; the quotient is then exactly 0.0, so the warning is
        # suppressed rather than treated as an error.
        with np.errstate(over="ignore"):
            return 1 / (1 + np.exp(-logits))
class GroundingDINODetector:
    """
    Grounding DINO for open-vocabulary object detection with attribute queries.

    Supports complex queries like "person wearing white clothes", "red car", etc.
    Model: IDEA-Research/grounding-dino-tiny

    Key details (transformers >= 5.x):
    - Processor's __call__ accepts text as str, list[str], or list[list[str]]
      and auto-converts to the "label1 . label2 ." format internally
    - post_process_grounded_object_detection: input_ids is optional,
      uses 'threshold' (not 'box_threshold'), returns both 'text_labels' and 'labels'
    """

    # Label set queried by detect_default_attributes().
    DEFAULT_LABELS: Tuple[str, ...] = (
        "person", "car", "truck", "bicycle", "motorcycle",
        "dog", "cat", "bird", "chair", "table",
        "building", "tree", "sign", "phone", "bag",
    )

    def __init__(self, model_name: str = "IDEA-Research/grounding-dino-tiny",
                 device: str = "cpu",
                 box_threshold: float = 0.35,
                 text_threshold: float = 0.25):
        """
        Load the processor and model and move the model to ``device``.

        Args:
            model_name: Checkpoint name passed to ``from_pretrained``.
            device: Torch device string the model and inputs are moved to.
            box_threshold: Passed as ``threshold`` to post-processing.
            text_threshold: Passed as ``text_threshold`` to post-processing.
        """
        # Imported lazily so importing this module does not require transformers.
        from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

        print(f"Loading Grounding DINO ({model_name}) on {device}...")
        self.processor = AutoProcessor.from_pretrained(model_name)
        self.model = AutoModelForZeroShotObjectDetection.from_pretrained(
            model_name
        ).to(device).eval()
        self.device = device
        self.box_threshold = box_threshold
        self.text_threshold = text_threshold
        print(" ✅ Grounding DINO loaded")

    @staticmethod
    def _normalize_labels(labels: List[str]) -> List[str]:
        """Lower-case and strip each label, dropping any that end up empty."""
        cleaned = (label.lower().strip() for label in labels)
        return [label for label in cleaned if label]

    @torch.no_grad()
    def detect(self, image: "Image.Image", labels: List[str]) -> List["Detection"]:
        """
        Detect objects matching the given text labels in an image.

        Args:
            image: PIL Image
            labels: List of text descriptions, e.g. ["person wearing white clothes", "red car"]

        Returns:
            List of Detection objects with labels, confidence, and bounding boxes
            (coordinates rounded to 2 decimals). Returns an empty list when no
            non-blank label is supplied; the model is not run in that case.
        """
        text_query = self._normalize_labels(labels)
        if not text_query:
            # Nothing to search for; skip the forward pass.
            return []

        # Processor accepts a list of labels directly and converts to its own format
        inputs = self.processor(
            images=image,
            text=text_query,
            return_tensors="pt",
        ).to(self.device)
        outputs = self.model(**inputs)

        # transformers >= 5.x: threshold (not box_threshold), input_ids optional
        # target_sizes expects (height, width)
        results = self.processor.post_process_grounded_object_detection(
            outputs,
            threshold=self.box_threshold,
            text_threshold=self.text_threshold,
            target_sizes=[(image.height, image.width)],
        )
        if not results:
            return []

        # Only one image was submitted, so only the first result is relevant.
        result = results[0]
        # Prefer "text_labels"; fall back to "labels" when it is absent.
        label_key = "text_labels" if "text_labels" in result else "labels"
        return [
            Detection(
                label=text_label,
                confidence=float(score),
                bbox=[round(coord, 2) for coord in box.tolist()],
            )
            for box, score, text_label in zip(
                result["boxes"], result["scores"], result[label_key]
            )
        ]

    def detect_default_attributes(self, image: "Image.Image") -> List["Detection"]:
        """
        Run detection with the built-in DEFAULT_LABELS query set.

        Args:
            image: PIL Image

        Returns:
            Whatever detect() returns for DEFAULT_LABELS on this image.
        """
        # detect() already disables gradient tracking.
        return self.detect(image, list(self.DEFAULT_LABELS))
|