""" Video Intelligence Platform — Visual Encoders SigLIP2 for frame embeddings + Grounding DINO for attribute detection. Verified against transformers >= 5.x API (Apr 2026): - SigLIP2: AutoModel + AutoProcessor → SiglipModel, get_image_features returns BaseModelOutputWithPooling (has .pooler_output when return_dict=True, which is default) - Grounding DINO: AutoModelForZeroShotObjectDetection + AutoProcessor post_process_grounded_object_detection accepts input_ids (optional), threshold (not box_threshold), returns dict with "text_labels" and "labels" keys """ import io import torch import torch.nn.functional as F import numpy as np from PIL import Image from typing import List, Dict, Optional, Tuple from dataclasses import dataclass @dataclass class Detection: """A single object detection with attributes.""" label: str confidence: float bbox: List[float] # [x0, y0, x1, y1] in absolute pixels timestamp_sec: float = 0.0 class SigLIPEncoder: """ SigLIP2 encoder for frame → embedding and text → embedding. Shared embedding space enables cross-modal similarity search. Model: google/siglip2-so400m-patch14-384 (1152-dim embeddings) Key details: - get_image_features() returns BaseModelOutputWithPooling (return_dict=True default) - .pooler_output gives [B, 1152] pooled representation - Text MUST use padding="max_length" (SigLIP training requirement) - Use sigmoid (not softmax) for similarity scores """ def __init__(self, model_name: str = "google/siglip2-so400m-patch14-384", device: str = "cpu"): from transformers import AutoModel, AutoProcessor print(f"🔄 Loading SigLIP2 ({model_name}) on {device}...") self.processor = AutoProcessor.from_pretrained(model_name) self.model = AutoModel.from_pretrained( model_name, torch_dtype=torch.float32 ).to(device).eval() self.device = device self.embedding_dim = 1152 print(f" ✅ SigLIP2 loaded (dim={self.embedding_dim})") @torch.no_grad() def embed_frames(self, images: List[Image.Image], batch_size: int = 8) -> np.ndarray: """ Embed a list of PIL images into normalized vectors. Returns: np.ndarray of shape [N, 1152], L2-normalized """ all_embeddings = [] for i in range(0, len(images), batch_size): batch = images[i:i + batch_size] inputs = self.processor(images=batch, return_tensors="pt").to(self.device) # get_image_features returns BaseModelOutputWithPooling (return_dict=True by default) outputs = self.model.get_image_features(**inputs) embeddings = outputs.pooler_output # [B, 1152] embeddings = F.normalize(embeddings, dim=-1) all_embeddings.append(embeddings.cpu().numpy()) return np.concatenate(all_embeddings, axis=0) if all_embeddings else np.empty((0, self.embedding_dim)) @torch.no_grad() def embed_texts(self, texts: List[str]) -> np.ndarray: """ Embed text queries into the same space as frames. Returns: np.ndarray of shape [N, 1152], L2-normalized """ if not texts: return np.empty((0, self.embedding_dim)) inputs = self.processor( text=texts, padding="max_length", # CRITICAL: required for SigLIP return_tensors="pt", ).to(self.device) # get_text_features returns BaseModelOutputWithPooling (return_dict=True by default) outputs = self.model.get_text_features(**inputs) embeddings = outputs.pooler_output # [N, 1152] embeddings = F.normalize(embeddings, dim=-1) return embeddings.cpu().numpy() @torch.no_grad() def compute_similarity(self, frame_embeddings: np.ndarray, text_embeddings: np.ndarray) -> np.ndarray: """ Compute cosine similarity between frame and text embeddings. Uses sigmoid (SigLIP objective) for per-pair probabilities. Returns: np.ndarray of shape [num_frames, num_texts] """ # Cosine similarity (embeddings are already L2-normalized) similarity = frame_embeddings @ text_embeddings.T # SigLIP uses sigmoid, not softmax return 1 / (1 + np.exp(-similarity * 5.0)) # approximate logit_scale class GroundingDINODetector: """ Grounding DINO for open-vocabulary object detection with attribute queries. Supports complex queries like "person wearing white clothes", "red car", etc. Model: IDEA-Research/grounding-dino-tiny Key details (transformers >= 5.x): - Processor's __call__ accepts text as str, list[str], or list[list[str]] and auto-converts to the "label1 . label2 ." format internally - post_process_grounded_object_detection: input_ids is optional, uses 'threshold' (not 'box_threshold'), returns both 'text_labels' and 'labels' """ def __init__(self, model_name: str = "IDEA-Research/grounding-dino-tiny", device: str = "cpu", box_threshold: float = 0.35, text_threshold: float = 0.25): from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection print(f"🔄 Loading Grounding DINO ({model_name}) on {device}...") self.processor = AutoProcessor.from_pretrained(model_name) self.model = AutoModelForZeroShotObjectDetection.from_pretrained( model_name ).to(device).eval() self.device = device self.box_threshold = box_threshold self.text_threshold = text_threshold print(f" ✅ Grounding DINO loaded") @torch.no_grad() def detect(self, image: Image.Image, labels: List[str]) -> List[Detection]: """ Detect objects matching the given text labels in an image. Args: image: PIL Image labels: List of text descriptions, e.g. ["person wearing white clothes", "red car"] Returns: List of Detection objects with labels, confidence, and bounding boxes """ # Processor accepts list of labels directly and converts to correct format # Also accepts the "label1 . label2 ." string format text_query = [l.lower().strip() for l in labels] inputs = self.processor( images=image, text=text_query, return_tensors="pt", ).to(self.device) outputs = self.model(**inputs) # transformers >= 5.x: threshold (not box_threshold), input_ids optional # target_sizes expects (height, width) results = self.processor.post_process_grounded_object_detection( outputs, threshold=self.box_threshold, text_threshold=self.text_threshold, target_sizes=[(image.height, image.width)], ) detections = [] if results: result = results[0] # Both "text_labels" and "labels" exist in current API label_key = "text_labels" if "text_labels" in result else "labels" for box, score, text_label in zip( result["boxes"], result["scores"], result[label_key] ): detections.append(Detection( label=text_label, confidence=float(score), bbox=[round(x, 2) for x in box.tolist()], )) return detections @torch.no_grad() def detect_default_attributes(self, image: Image.Image) -> List[Detection]: """ Run detection with a comprehensive set of default attribute queries. This indexes everything visible in the frame. """ default_labels = [ "person", "car", "truck", "bicycle", "motorcycle", "dog", "cat", "bird", "chair", "table", "building", "tree", "sign", "phone", "bag", ] return self.detect(image, default_labels)