# File size: 8,086 Bytes

"""
Video Intelligence Platform — Visual Encoders
SigLIP2 for frame embeddings + Grounding DINO for attribute detection.

Verified against transformers >= 5.x API (Apr 2026):
- SigLIP2: AutoModel + AutoProcessor → SiglipModel, get_image_features returns
  BaseModelOutputWithPooling (has .pooler_output when return_dict=True, which is default)
- Grounding DINO: AutoModelForZeroShotObjectDetection + AutoProcessor
  post_process_grounded_object_detection accepts input_ids (optional), threshold (not box_threshold),
  returns dict with "text_labels" and "labels" keys
"""
import io
import torch
import torch.nn.functional as F
import numpy as np
from PIL import Image
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass


@dataclass
class Detection:
    """
    One detected object: its text label, score, box, and an optional timestamp.

    Instances are plain records; the generated dataclass __init__ accepts the
    fields positionally in the order declared below.
    """
    # Text label associated with the detected object.
    label: str
    # Detection score as a plain Python float.
    confidence: float
    bbox: List[float]  # [x0, y0, x1, y1] in absolute pixels
    # Defaults to 0.0 when not supplied.
    # NOTE(review): presumably the frame's position in the source video, in
    # seconds, filled in by the caller — confirm against the indexing code.
    timestamp_sec: float = 0.0


class SigLIPEncoder:
    """
    SigLIP2 encoder for frame → embedding and text → embedding.
    Shared embedding space enables cross-modal similarity search.

    Model: google/siglip2-so400m-patch14-384 (1152-dim embeddings)
    Key details:
    - get_image_features() / get_text_features() may return either a
      BaseModelOutputWithPooling (pooled vector in .pooler_output) or a bare
      tensor, depending on the transformers version; both are handled.
    - Text MUST use padding="max_length" (SigLIP training requirement); the
      Hugging Face SigLIP2 usage notes pair this with max_length=64.
    - Use sigmoid (not softmax) for similarity scores
    - All returned embeddings are float32 and L2-normalized.
    """

    # Token length used for text queries (see class docstring).
    _TEXT_MAX_LENGTH = 64

    def __init__(self, model_name: str = "google/siglip2-so400m-patch14-384",
                 device: str = "cpu"):
        """
        Load the processor and model and move the model to ``device``.

        Args:
            model_name: Hugging Face checkpoint identifier.
            device: torch device string the model and inputs are moved to.
        """
        # Imported lazily so that importing this module does not require
        # transformers until an encoder is actually constructed.
        from transformers import AutoModel, AutoProcessor

        print(f"🔄 Loading SigLIP2 ({model_name}) on {device}...")
        self.processor = AutoProcessor.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(
            model_name, torch_dtype=torch.float32
        ).to(device).eval()
        self.device = device
        # Width of the default so400m checkpoint. It is only used to shape
        # empty results and in the log line below.
        # NOTE(review): other checkpoints may have a different width — confirm
        # before passing a non-default model_name.
        self.embedding_dim = 1152
        print(f"   ✅ SigLIP2 loaded (dim={self.embedding_dim})")

    @staticmethod
    def _pooled(outputs):
        """Return the pooled embedding tensor from a get_*_features result."""
        if isinstance(outputs, torch.Tensor):
            return outputs
        return outputs.pooler_output

    def _empty_embeddings(self) -> np.ndarray:
        """Return a [0, embedding_dim] float32 array for empty inputs."""
        # float32 keeps the dtype identical to non-empty results, so callers
        # can concatenate or index them without silent upcasting.
        return np.empty((0, self.embedding_dim), dtype=np.float32)

    @torch.no_grad()
    def embed_frames(self, images: List[Image.Image],
                     batch_size: int = 8) -> np.ndarray:
        """
        Embed a list of PIL images into normalized vectors.

        Args:
            images: Images to embed; may be empty.
            batch_size: Number of images per forward pass (must be >= 1).

        Returns:
            np.ndarray of shape [N, embedding_dim], float32, L2-normalized

        Raises:
            ValueError: if ``batch_size`` is less than 1.
        """
        if batch_size < 1:
            # A negative step would otherwise silently yield an empty result.
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        if len(images) == 0:
            return self._empty_embeddings()

        chunks = []
        for start in range(0, len(images), batch_size):
            batch = images[start:start + batch_size]
            inputs = self.processor(images=batch, return_tensors="pt").to(self.device)
            pooled = self._pooled(self.model.get_image_features(**inputs))
            chunks.append(F.normalize(pooled, dim=-1).cpu().numpy())

        return np.concatenate(chunks, axis=0)

    @torch.no_grad()
    def embed_texts(self, texts: List[str]) -> np.ndarray:
        """
        Embed text queries into the same space as frames.

        Queries longer than the fixed text length are truncated rather than
        raising inside the model.

        Args:
            texts: Query strings; may be empty.

        Returns:
            np.ndarray of shape [N, embedding_dim], float32, L2-normalized
        """
        if not texts:
            return self._empty_embeddings()

        inputs = self.processor(
            text=texts,
            padding="max_length",  # CRITICAL: required for SigLIP
            max_length=self._TEXT_MAX_LENGTH,
            truncation=True,
            return_tensors="pt",
        ).to(self.device)
        pooled = self._pooled(self.model.get_text_features(**inputs))
        return F.normalize(pooled, dim=-1).cpu().numpy()

    def compute_similarity(self, frame_embeddings: np.ndarray,
                           text_embeddings: np.ndarray,
                           logit_scale: float = 5.0) -> np.ndarray:
        """
        Score every frame against every text query.

        The cosine similarity (a plain dot product, since both inputs are
        expected to be L2-normalized) is multiplied by ``logit_scale`` and
        passed through a sigmoid, so each pair gets an independent score in
        (0, 1). No bias term is applied, so a cosine of 0 maps to exactly 0.5.

        Args:
            frame_embeddings: Array of shape [num_frames, dim].
            text_embeddings: Array of shape [num_texts, dim].
            logit_scale: Multiplier applied before the sigmoid. The default
                of 5.0 is a hand-picked approximation, not the model's
                learned scale.

        Returns:
            np.ndarray of shape [num_frames, num_texts]
        """
        cosine = frame_embeddings @ text_embeddings.T
        # SigLIP uses sigmoid, not softmax
        return 1 / (1 + np.exp(-cosine * logit_scale))


class GroundingDINODetector:
    """
    Grounding DINO for open-vocabulary object detection with attribute queries.
    Supports complex queries like "person wearing white clothes", "red car", etc.

    Model: IDEA-Research/grounding-dino-tiny
    Key details:
    - The text prompt is sent in the canonical "label1. label2." form, with
      "." acting as the phrase separator. It is built explicitly here so the
      result does not depend on the processor's list-handling heuristics.
    - post_process_grounded_object_detection is called with 'threshold'
      (not 'box_threshold') and without input_ids, matching the
      transformers >= 5.x API this file targets.
    """

    # Labels used by detect_default_attributes(); immutable so it is safe to
    # share across instances.
    DEFAULT_LABELS: Tuple[str, ...] = (
        "person", "car", "truck", "bicycle", "motorcycle",
        "dog", "cat", "bird", "chair", "table",
        "building", "tree", "sign", "phone", "bag",
    )

    def __init__(self, model_name: str = "IDEA-Research/grounding-dino-tiny",
                 device: str = "cpu",
                 box_threshold: float = 0.35,
                 text_threshold: float = 0.25):
        """
        Load the processor and model and move the model to ``device``.

        Args:
            model_name: Hugging Face checkpoint identifier.
            device: torch device string the model and inputs are moved to.
            box_threshold: Passed as ``threshold`` to post-processing.
            text_threshold: Passed as ``text_threshold`` to post-processing.
        """
        # Imported lazily so that importing this module does not require
        # transformers until a detector is actually constructed.
        from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

        print(f"🔄 Loading Grounding DINO ({model_name}) on {device}...")
        self.processor = AutoProcessor.from_pretrained(model_name)
        self.model = AutoModelForZeroShotObjectDetection.from_pretrained(
            model_name
        ).to(device).eval()
        self.device = device
        self.box_threshold = box_threshold
        self.text_threshold = text_threshold
        print(f"   ✅ Grounding DINO loaded")

    @staticmethod
    def _build_text_query(labels: List[str]) -> str:
        """
        Merge labels into a single "label1. label2." prompt.

        Labels are lower-cased, periods inside a label are replaced by spaces
        (a period would otherwise split the phrase), whitespace is collapsed,
        and labels that end up empty are dropped. Returns "" when nothing
        usable remains.
        """
        cleaned = []
        for raw in labels:
            phrase = " ".join(raw.lower().replace(".", " ").split())
            if phrase:
                cleaned.append(phrase)
        if not cleaned:
            return ""
        return ". ".join(cleaned) + "."

    @torch.no_grad()
    def detect(self, image: Image.Image, labels: List[str]) -> List[Detection]:
        """
        Detect objects matching the given text labels in an image.

        Args:
            image: PIL Image
            labels: List of text descriptions, e.g. ["person wearing white clothes", "red car"]

        Returns:
            List of Detection objects with labels, confidence, and bounding
            boxes rounded to two decimals. Empty when ``labels`` contains no
            usable text; the model is not run in that case.
        """
        text_query = self._build_text_query(labels)
        if not text_query:
            return []

        inputs = self.processor(
            images=image,
            text=text_query,
            return_tensors="pt",
        ).to(self.device)

        outputs = self.model(**inputs)

        # transformers >= 5.x: threshold (not box_threshold), input_ids optional
        # target_sizes expects (height, width)
        results = self.processor.post_process_grounded_object_detection(
            outputs,
            threshold=self.box_threshold,
            text_threshold=self.text_threshold,
            target_sizes=[(image.height, image.width)],
        )
        if not results:
            return []

        # Only one image was submitted, so only the first result is relevant.
        result = results[0]
        # Prefer the string labels when present; fall back to "labels".
        label_key = "text_labels" if "text_labels" in result else "labels"
        return [
            Detection(
                label=text_label,
                confidence=float(score),
                bbox=[round(coord, 2) for coord in box.tolist()],
            )
            for box, score, text_label in zip(
                result["boxes"], result["scores"], result[label_key]
            )
        ]

    @torch.no_grad()
    def detect_default_attributes(self, image: Image.Image) -> List[Detection]:
        """
        Run detection with the built-in DEFAULT_LABELS set of common objects.

        Args:
            image: PIL Image

        Returns:
            Same as detect() called with the default labels.
        """
        return self.detect(image, list(self.DEFAULT_LABELS))