fix: update visual_encoders.py - verified API for transformers 5.x (SigLIP2 + Grounding DINO)

abb7d19 verified 27 days ago

8.09 kB

	"""
	Video Intelligence Platform — Visual Encoders
	SigLIP2 for frame embeddings + Grounding DINO for attribute detection.

	Verified against the transformers >= 5.x API (Apr 2026):
	- SigLIP2: AutoModel + AutoProcessor resolve to SiglipModel; get_image_features returns a
	  BaseModelOutputWithPooling, which exposes .pooler_output when return_dict=True (the default).
	- Grounding DINO: AutoModelForZeroShotObjectDetection + AutoProcessor.
	  post_process_grounded_object_detection takes an optional input_ids and a `threshold`
	  argument (not `box_threshold`), and returns one dict per image containing both
	  "text_labels" and "labels" keys.
	"""
	import io
	import torch
	import torch.nn.functional as F
	import numpy as np
	from PIL import Image
	from typing import List, Dict, Optional, Tuple
	from dataclasses import dataclass


	@dataclass
	class Detection:
	    """
	    One detected object: its label, score, bounding box and timestamp.

	    Fields are positional in the order declared below; only
	    ``timestamp_sec`` has a default.
	    """

	    # Text label reported for the detected object.
	    label: str
	    # Detection score attached to this box.
	    confidence: float
	    # Box corners [x0, y0, x1, y1] in absolute pixels.
	    bbox: List[float]
	    # Timestamp in seconds; stays 0.0 unless the caller supplies one.
	    timestamp_sec: float = 0.0


	class SigLIPEncoder:
	    """
	    SigLIP2 encoder for frame → embedding and text → embedding.

	    Frames and text queries land in one shared embedding space, so
	    cross-modal search reduces to a dot product between L2-normalized rows.

	    Default model: google/siglip2-so400m-patch14-384 (1152-dim embeddings).

	    Key details:
	    - get_image_features() / get_text_features() are expected to return a
	      BaseModelOutputWithPooling whose .pooler_output is the pooled
	      embedding; a bare tensor result is accepted as well (see _pooled).
	    - Text MUST use padding="max_length" (SigLIP training requirement); it
	      is also truncated so long queries cannot exceed the text tower's
	      position limit.
	    - Use sigmoid (not softmax) for similarity scores.
	    """

	    # Fallbacks used when the loaded model's config does not expose a value.
	    _DEFAULT_EMBEDDING_DIM = 1152
	    _DEFAULT_MAX_TEXT_LENGTH = 64

	    def __init__(self, model_name: str = "google/siglip2-so400m-patch14-384",
	                 device: str = "cpu"):
	        """
	        Load the processor and model and move the model to ``device``.

	        Args:
	            model_name: Hugging Face model id passed to ``from_pretrained``.
	            device: Torch device string the model and inputs are moved to.
	        """
	        from transformers import AutoModel, AutoProcessor

	        print(f"🔄 Loading SigLIP2 ({model_name}) on {device}...")
	        self.processor = AutoProcessor.from_pretrained(model_name)
	        self.model = AutoModel.from_pretrained(
	            model_name, torch_dtype=torch.float32
	        ).to(device).eval()
	        self.device = device
	        # Read the width from the checkpoint instead of assuming so400m.
	        # NOTE(review): assumes pooled image embeddings have the vision
	        # tower's hidden size — confirm before using non-SigLIP checkpoints.
	        vision_cfg = getattr(self.model.config, "vision_config", None)
	        self.embedding_dim = (
	            getattr(vision_cfg, "hidden_size", None) or self._DEFAULT_EMBEDDING_DIM
	        )
	        print(f" ✅ SigLIP2 loaded (dim={self.embedding_dim})")

	    @staticmethod
	    def _pooled(outputs):
	        """Return the pooled embedding from a get_*_features() result."""
	        # Output objects carry .pooler_output; a bare tensor is used as-is.
	        return getattr(outputs, "pooler_output", outputs)

	    def _max_text_length(self) -> int:
	        """Return the token length text is padded/truncated to."""
	        text_cfg = getattr(getattr(self.model, "config", None), "text_config", None)
	        return (
	            getattr(text_cfg, "max_position_embeddings", None)
	            or self._DEFAULT_MAX_TEXT_LENGTH
	        )

	    @torch.no_grad()
	    def embed_frames(self, images: List[Image.Image],
	                     batch_size: int = 8) -> np.ndarray:
	        """
	        Embed a list of PIL images into normalized vectors.

	        Args:
	            images: Frames to embed; an empty list is allowed.
	            batch_size: Number of frames sent through the model per step.

	        Returns:
	            np.ndarray of shape [N, embedding_dim], L2-normalized, float32.

	        Raises:
	            ValueError: If ``batch_size`` is smaller than 1.
	        """
	        if batch_size < 1:
	            # A negative step would otherwise silently yield zero batches.
	            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

	        chunks = []
	        for start in range(0, len(images), batch_size):
	            batch = images[start:start + batch_size]
	            inputs = self.processor(images=batch, return_tensors="pt").to(self.device)
	            pooled = self._pooled(self.model.get_image_features(**inputs))
	            chunks.append(F.normalize(pooled, dim=-1).cpu().numpy())

	        if not chunks:
	            # float32 keeps the empty result consistent with real batches
	            # (the model is loaded in float32).
	            return np.empty((0, self.embedding_dim), dtype=np.float32)
	        return np.concatenate(chunks, axis=0)

	    @torch.no_grad()
	    def embed_texts(self, texts: List[str]) -> np.ndarray:
	        """
	        Embed text queries into the same space as frames.

	        Args:
	            texts: Query strings; an empty list is allowed.

	        Returns:
	            np.ndarray of shape [N, embedding_dim], L2-normalized, float32.
	        """
	        if not texts:
	            return np.empty((0, self.embedding_dim), dtype=np.float32)

	        inputs = self.processor(
	            text=texts,
	            padding="max_length",  # CRITICAL: required for SigLIP
	            max_length=self._max_text_length(),
	            truncation=True,  # clip over-long queries instead of failing
	            return_tensors="pt",
	        ).to(self.device)
	        pooled = self._pooled(self.model.get_text_features(**inputs))
	        return F.normalize(pooled, dim=-1).cpu().numpy()

	    def compute_similarity(self, frame_embeddings: np.ndarray,
	                           text_embeddings: np.ndarray,
	                           scale: float = 5.0) -> np.ndarray:
	        """
	        Compute sigmoid-squashed cosine similarity between frames and texts.

	        Both inputs are expected to be L2-normalized already, so the matrix
	        product is the cosine similarity.

	        Args:
	            frame_embeddings: Array of shape [num_frames, dim].
	            text_embeddings: Array of shape [num_texts, dim].
	            scale: Multiplier applied to the cosine similarity before the
	                sigmoid. The default 5.0 is a rough stand-in for the
	                model's logit scale, not the learned value.

	        Returns:
	            np.ndarray of shape [num_frames, num_texts] with values in (0, 1).
	        """
	        similarity = frame_embeddings @ text_embeddings.T
	        # SigLIP uses sigmoid, not softmax
	        return 1 / (1 + np.exp(-similarity * scale))


	class GroundingDINODetector:
	    """
	    Grounding DINO for open-vocabulary object detection with attribute queries.
	    Supports complex queries like "person wearing white clothes", "red car", etc.

	    Default model: IDEA-Research/grounding-dino-tiny
	    Key details (transformers >= 5.x):
	    - Processor's __call__ accepts text as str, list[str], or list[list[str]]
	      and auto-converts to the "label1 . label2 ." format internally
	    - post_process_grounded_object_detection: input_ids is optional,
	      uses 'threshold' (not 'box_threshold'), returns both 'text_labels' and 'labels'
	    """

	    # Label set used by detect_default_attributes(); built once, not per call.
	    DEFAULT_LABELS = (
	        "person", "car", "truck", "bicycle", "motorcycle",
	        "dog", "cat", "bird", "chair", "table",
	        "building", "tree", "sign", "phone", "bag",
	    )

	    def __init__(self, model_name: str = "IDEA-Research/grounding-dino-tiny",
	                 device: str = "cpu",
	                 box_threshold: float = 0.35,
	                 text_threshold: float = 0.25):
	        """
	        Load the processor and model and move the model to ``device``.

	        Args:
	            model_name: Hugging Face model id passed to ``from_pretrained``.
	            device: Torch device string the model and inputs are moved to.
	            box_threshold: Passed as ``threshold`` to post-processing.
	            text_threshold: Passed as ``text_threshold`` to post-processing.
	        """
	        from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

	        print(f"🔄 Loading Grounding DINO ({model_name}) on {device}...")
	        self.processor = AutoProcessor.from_pretrained(model_name)
	        self.model = AutoModelForZeroShotObjectDetection.from_pretrained(
	            model_name
	        ).to(device).eval()
	        self.device = device
	        self.box_threshold = box_threshold
	        self.text_threshold = text_threshold
	        print(" ✅ Grounding DINO loaded")

	    @torch.no_grad()
	    def detect(self, image: Image.Image, labels: List[str]) -> List[Detection]:
	        """
	        Detect objects matching the given text labels in an image.

	        Args:
	            image: PIL Image
	            labels: List of text descriptions, e.g.
	                ["person wearing white clothes", "red car"]. Labels are
	                lower-cased and stripped; blank entries are ignored.

	        Returns:
	            List of Detection objects with labels, confidence, and bounding
	            boxes. Empty when no usable label is given or nothing is found.
	        """
	        cleaned = (label.lower().strip() for label in labels)
	        text_query = [label for label in cleaned if label]
	        if not text_query:
	            # Nothing to look for: skip the processor and the forward pass.
	            return []

	        # Processor accepts the list of labels directly and converts it to
	        # the "label1 . label2 ." format.
	        inputs = self.processor(
	            images=image,
	            text=text_query,
	            return_tensors="pt",
	        ).to(self.device)

	        outputs = self.model(**inputs)

	        # transformers >= 5.x: threshold (not box_threshold), input_ids optional
	        # target_sizes expects (height, width)
	        results = self.processor.post_process_grounded_object_detection(
	            outputs,
	            threshold=self.box_threshold,
	            text_threshold=self.text_threshold,
	            target_sizes=[(image.height, image.width)],
	        )
	        if not results:
	            return []

	        # Only one image was submitted, so only the first result is relevant.
	        result = results[0]
	        # Prefer the string labels; fall back to "labels" when absent.
	        label_key = "text_labels" if "text_labels" in result else "labels"
	        return [
	            Detection(
	                label=text_label,
	                confidence=float(score),
	                bbox=[round(x, 2) for x in box.tolist()],
	            )
	            for box, score, text_label in zip(
	                result["boxes"], result["scores"], result[label_key]
	            )
	        ]

	    @torch.no_grad()
	    def detect_default_attributes(self, image: Image.Image) -> List[Detection]:
	        """
	        Run detection with the built-in DEFAULT_LABELS query set.

	        Args:
	            image: PIL Image

	        Returns:
	            List of Detection objects, as returned by detect().
	        """
	        return self.detect(image, list(self.DEFAULT_LABELS))