fix: update visual_encoders.py - verified API for transformers 5.x (SigLIP2 + Grounding DINO)
abb7d19 verified | """ | |
| Video Intelligence Platform — Visual Encoders | |
| SigLIP2 for frame embeddings + Grounding DINO for attribute detection. | |
| Verified against transformers >= 5.x API (Apr 2026): | |
| - SigLIP2: AutoModel + AutoProcessor → SiglipModel, get_image_features returns | |
| BaseModelOutputWithPooling (has .pooler_output when return_dict=True, which is default) | |
| - Grounding DINO: AutoModelForZeroShotObjectDetection + AutoProcessor | |
| post_process_grounded_object_detection accepts input_ids (optional) and threshold (not box_threshold); | |
| it returns a list with one dict per image, each with "text_labels" and "labels" keys | |
| """ | |
| import io | |
| import torch | |
| import torch.nn.functional as F | |
| import numpy as np | |
| from PIL import Image | |
| from typing import List, Dict, Optional, Tuple | |
| from dataclasses import dataclass | |
@dataclass
class Detection:
    """A single object detection with attributes.

    Declared as a dataclass so instances can be built with keyword
    arguments (``Detection(label=..., confidence=..., bbox=...)``) and get
    a generated ``__init__``, ``__repr__`` and ``__eq__``.
    """

    # Text label reported for the detected object.
    label: str
    # Detection score for this box.
    confidence: float
    # [x0, y0, x1, y1] in absolute pixels
    bbox: List[float]
    # Timestamp in seconds; presumably the frame's position in the source
    # video (set by the caller). Defaults to 0.0.
    timestamp_sec: float = 0.0
class SigLIPEncoder:
    """
    SigLIP2 encoder for frame → embedding and text → embedding.

    Images and texts are embedded into one shared space, which enables
    cross-modal similarity search.

    Model: google/siglip2-so400m-patch14-384 (1152-dim embeddings)

    Key details:
    - get_image_features() / get_text_features() return
      BaseModelOutputWithPooling (return_dict=True default);
      .pooler_output gives the [B, 1152] pooled representation.
      A bare tensor return is accepted as well.
    - Text MUST use padding="max_length" (SigLIP training requirement)
    - Use sigmoid (not softmax) for similarity scores
    - All forward passes run under torch.no_grad(); this class is
      inference-only.
    """

    def __init__(self, model_name: str = "google/siglip2-so400m-patch14-384",
                 device: str = "cpu"):
        """Load the processor and model, then move the model to ``device``.

        Args:
            model_name: Hugging Face model id to load.
            device: Torch device string the model and inputs are moved to.
        """
        # Local import: transformers is only needed once an encoder is built.
        from transformers import AutoModel, AutoProcessor

        print(f"🔄 Loading SigLIP2 ({model_name}) on {device}...")
        self.processor = AutoProcessor.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(
            model_name, torch_dtype=torch.float32
        ).to(device).eval()
        self.device = device
        # Width of the pooled embedding for the default checkpoint.
        # NOTE(review): hard-coded; confirm it matches any non-default model_name.
        self.embedding_dim = 1152
        print(f" ✅ SigLIP2 loaded (dim={self.embedding_dim})")

    @staticmethod
    def _to_unit_numpy(outputs) -> np.ndarray:
        """Convert a get_*_features() result to an L2-normalized numpy array."""
        # Accept both a model-output object exposing .pooler_output and a
        # plain tensor.
        pooled = getattr(outputs, "pooler_output", outputs)
        # detach() keeps .numpy() valid even for tensors that track gradients.
        return F.normalize(pooled, dim=-1).detach().cpu().numpy()

    def embed_frames(self, images: List[Image.Image],
                     batch_size: int = 8) -> np.ndarray:
        """
        Embed a list of PIL images into normalized vectors.

        Args:
            images: Frames to embed.
            batch_size: Number of images per forward pass; must be >= 1.

        Returns:
            np.ndarray of shape [N, 1152], L2-normalized (float32)

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        if not images:
            # float32 matches the dtype produced for non-empty input.
            return np.empty((0, self.embedding_dim), dtype=np.float32)

        chunks = []
        with torch.no_grad():
            for start in range(0, len(images), batch_size):
                batch = images[start:start + batch_size]
                inputs = self.processor(images=batch, return_tensors="pt").to(self.device)
                outputs = self.model.get_image_features(**inputs)
                chunks.append(self._to_unit_numpy(outputs))
        return np.concatenate(chunks, axis=0)

    def embed_texts(self, texts: List[str]) -> np.ndarray:
        """
        Embed text queries into the same space as frames.

        Args:
            texts: Query strings to embed.

        Returns:
            np.ndarray of shape [N, 1152], L2-normalized (float32)
        """
        if not texts:
            return np.empty((0, self.embedding_dim), dtype=np.float32)

        inputs = self.processor(
            text=texts,
            padding="max_length",  # CRITICAL: required for SigLIP
            return_tensors="pt",
        ).to(self.device)
        with torch.no_grad():
            outputs = self.model.get_text_features(**inputs)
        return self._to_unit_numpy(outputs)

    def compute_similarity(self, frame_embeddings: np.ndarray,
                           text_embeddings: np.ndarray,
                           logit_scale: float = 5.0) -> np.ndarray:
        """
        Compute sigmoid-squashed cosine similarity between frames and texts.

        Uses sigmoid (SigLIP objective) for per-pair probabilities.

        Args:
            frame_embeddings: [num_frames, D] array, expected L2-normalized.
            text_embeddings: [num_texts, D] array, expected L2-normalized.
            logit_scale: Multiplier applied to the cosine similarity before
                the sigmoid. The default 5.0 is an approximation; the
                checkpoint's own learned scale/bias are not applied here.

        Returns:
            np.ndarray of shape [num_frames, num_texts]
        """
        # Dot product equals cosine similarity for L2-normalized inputs.
        similarity = frame_embeddings @ text_embeddings.T
        # SigLIP uses sigmoid, not softmax
        return 1 / (1 + np.exp(-similarity * logit_scale))
class GroundingDINODetector:
    """
    Grounding DINO for open-vocabulary object detection with attribute queries.

    Supports complex queries like "person wearing white clothes", "red car", etc.

    Model: IDEA-Research/grounding-dino-tiny

    Key details (transformers >= 5.x):
    - Processor's __call__ accepts text as str, list[str], or list[list[str]]
      and auto-converts to the "label1 . label2 ." format internally
    - post_process_grounded_object_detection: input_ids is optional,
      uses 'threshold' (not 'box_threshold'), returns both 'text_labels' and 'labels'
    """

    # Label vocabulary used by detect_default_attributes().
    DEFAULT_LABELS: Tuple[str, ...] = (
        "person", "car", "truck", "bicycle", "motorcycle",
        "dog", "cat", "bird", "chair", "table",
        "building", "tree", "sign", "phone", "bag",
    )

    def __init__(self, model_name: str = "IDEA-Research/grounding-dino-tiny",
                 device: str = "cpu",
                 box_threshold: float = 0.35,
                 text_threshold: float = 0.25):
        """Load the processor and model, then move the model to ``device``.

        Args:
            model_name: Hugging Face model id to load.
            device: Torch device string the model and inputs are moved to.
            box_threshold: Passed as ``threshold`` to the post-processing step.
            text_threshold: Passed as ``text_threshold`` to the post-processing step.
        """
        # Local import: transformers is only needed once a detector is built.
        from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

        print(f"🔄 Loading Grounding DINO ({model_name}) on {device}...")
        self.processor = AutoProcessor.from_pretrained(model_name)
        self.model = AutoModelForZeroShotObjectDetection.from_pretrained(
            model_name
        ).to(device).eval()
        self.device = device
        self.box_threshold = box_threshold
        self.text_threshold = text_threshold
        print(" ✅ Grounding DINO loaded")

    def detect(self, image: Image.Image, labels: List[str]) -> List[Detection]:
        """
        Detect objects matching the given text labels in an image.

        Args:
            image: PIL Image
            labels: List of text descriptions, e.g. ["person wearing white clothes", "red car"].
                A single string is treated as one label. Labels are lower-cased
                and stripped; blank labels are ignored.

        Returns:
            List of Detection objects with labels, confidence, and bounding boxes.
            Empty when no usable label is given.
        """
        if isinstance(labels, str):
            # Iterating a bare string would yield one "label" per character.
            labels = [labels]
        text_query = [label.lower().strip() for label in labels]
        text_query = [query for query in text_query if query]
        if not text_query:
            # Nothing to look for; skip the model call entirely.
            return []

        # Processor accepts list of labels directly and converts to correct format
        inputs = self.processor(
            images=image,
            text=text_query,
            return_tensors="pt",
        ).to(self.device)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            outputs = self.model(**inputs)

        # transformers >= 5.x: threshold (not box_threshold), input_ids optional
        # target_sizes expects (height, width)
        results = self.processor.post_process_grounded_object_detection(
            outputs,
            threshold=self.box_threshold,
            text_threshold=self.text_threshold,
            target_sizes=[(image.height, image.width)],
        )
        if not results:
            return []

        # One image in, so only the first per-image result is relevant.
        result = results[0]
        # Prefer "text_labels"; fall back to "labels" when it is absent.
        label_key = "text_labels" if "text_labels" in result else "labels"
        return [
            Detection(
                label=text_label,
                confidence=float(score),
                bbox=[round(x, 2) for x in box.tolist()],
            )
            for box, score, text_label in zip(
                result["boxes"], result["scores"], result[label_key]
            )
        ]

    def detect_default_attributes(self, image: Image.Image) -> List[Detection]:
        """
        Run detection with the built-in DEFAULT_LABELS vocabulary.

        Args:
            image: PIL Image

        Returns:
            List of Detection objects found for the default labels.
        """
        return self.detect(image, list(self.DEFAULT_LABELS))