notRaphael
/

video-intelligence-platform

Model card Files Files and versions

xet

Community

notRaphael committed 27 days ago

Commit

232f64f

verified ·

1 Parent(s): 7335d00

Add visual encoders (SigLIP2 + Grounding DINO)

Browse files

Files changed (1) hide show

video_intelligence/visual_encoders.py +188 -0

video_intelligence/visual_encoders.py ADDED Viewed

	@@ -0,0 +1,188 @@

+"""
+Video Intelligence Platform — Visual Encoders
+SigLIP2 for frame embeddings + Grounding DINO for attribute detection.
+Both run on CPU (no GPU required).
+"""
+import io
+import torch
+import torch.nn.functional as F
+import numpy as np
+from PIL import Image
+from typing import List, Dict, Optional, Tuple
+from dataclasses import dataclass
+@dataclass
+class Detection:
+    """
+    One detected object: its label, score, box and an optional time offset.
+
+    Each field is described by the comment directly above it.
+    """
+    # Text label reported for the detected object.
+    label: str
+    # Score attached to the detection.
+    confidence: float
+    # Box corners [x0, y0, x1, y1] in absolute pixels.
+    bbox: List[float]
+    # Time offset in seconds; 0.0 when the creator does not supply one.
+    # NOTE(review): presumably the source frame's position in the video —
+    # confirm against the code that fills it in.
+    timestamp_sec: float = 0.0
+class SigLIPEncoder:
+    """
+    SigLIP2 encoder mapping frames and text queries into one embedding space.
+
+    Both image and text vectors are L2-normalized before being returned, so a
+    dot product between them is a cosine similarity usable for cross-modal
+    search.
+    """
+
+    def __init__(self, model_name: str = "google/siglip2-so400m-patch14-384",
+                 device: str = "cpu"):
+        """
+        Load the processor and the model, move the model to *device*, and put
+        it in eval mode.
+
+        Args:
+            model_name: Checkpoint identifier passed to ``from_pretrained``.
+            device: Torch device string the model and all inputs are moved to.
+        """
+        # Imported here so that importing this module does not itself
+        # require transformers.
+        from transformers import AutoModel, AutoProcessor
+        print(f"🔄 Loading SigLIP2 ({model_name}) on {device}...")
+        self.processor = AutoProcessor.from_pretrained(model_name)
+        self.model = AutoModel.from_pretrained(
+            model_name, torch_dtype=torch.float32
+        ).to(device).eval()
+        self.device = device
+        # NOTE(review): this width is a constant, not read from the loaded
+        # model, so it is only right for checkpoints that emit 1152-d vectors.
+        # It is used for the shape of empty results — confirm before passing
+        # a different model_name.
+        self.embedding_dim = 1152
+        print(f"   ✅ SigLIP2 loaded (dim={self.embedding_dim})")
+
+    @staticmethod
+    def _pooled(outputs) -> torch.Tensor:
+        """
+        Return the pooled embedding tensor from a ``get_*_features`` result.
+
+        The result is accepted either as a bare tensor or as an output object
+        exposing ``pooler_output``, so the encoder does not depend on which of
+        the two forms the installed transformers release returns.
+        """
+        if isinstance(outputs, torch.Tensor):
+            return outputs
+        return outputs.pooler_output
+
+    @torch.no_grad()
+    def embed_frames(self, images: List[Image.Image],
+                      batch_size: int = 8) -> np.ndarray:
+        """
+        Embed a list of PIL images into L2-normalized vectors.
+
+        Args:
+            images: Frames to embed; may be empty.
+            batch_size: Number of images sent through the model per forward
+                pass. Must be at least 1.
+
+        Returns:
+            np.ndarray with one row per image. An empty input yields a
+            float32 array of shape [0, embedding_dim].
+
+        Raises:
+            ValueError: If batch_size is smaller than 1.
+        """
+        # A negative step would make range() yield nothing and silently drop
+        # every image, so reject bad sizes up front.
+        if batch_size < 1:
+            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
+        chunks = []
+        for start in range(0, len(images), batch_size):
+            batch = images[start:start + batch_size]
+            inputs = self.processor(images=batch, return_tensors="pt").to(self.device)
+            pooled = self._pooled(self.model.get_image_features(**inputs))
+            chunks.append(F.normalize(pooled, dim=-1).cpu().numpy())
+        if not chunks:
+            # float32 keeps the empty result consistent with the model's
+            # float32 outputs.
+            return np.empty((0, self.embedding_dim), dtype=np.float32)
+        return np.concatenate(chunks, axis=0)
+
+    @torch.no_grad()
+    def embed_texts(self, texts: List[str]) -> np.ndarray:
+        """
+        Embed text queries into the same space as frames.
+
+        Args:
+            texts: Query strings; may be empty.
+
+        Returns:
+            np.ndarray with one L2-normalized row per text. An empty input
+            yields a float32 array of shape [0, embedding_dim] without
+            calling the model.
+        """
+        if not texts:
+            return np.empty((0, self.embedding_dim), dtype=np.float32)
+        inputs = self.processor(
+            text=texts,
+            padding="max_length",  # CRITICAL: required for SigLIP
+            return_tensors="pt",
+        ).to(self.device)
+        pooled = self._pooled(self.model.get_text_features(**inputs))
+        return F.normalize(pooled, dim=-1).cpu().numpy()
+
+    def compute_similarity(self, frame_embeddings: np.ndarray,
+                            text_embeddings: np.ndarray,
+                            logit_scale: float = 5.0) -> np.ndarray:
+        """
+        Score every frame against every text query.
+
+        Takes the dot product of the two embedding matrices (a cosine
+        similarity when both are L2-normalized, as the embed_* methods
+        return them) and passes ``logit_scale * similarity`` through a
+        sigmoid, giving an independent score per pair instead of a softmax
+        over texts.
+
+        The scale is a fixed approximation and no bias term is applied, so
+        treat the outputs as relative scores rather than calibrated
+        probabilities.
+
+        Args:
+            frame_embeddings: Array of shape [num_frames, dim].
+            text_embeddings: Array of shape [num_texts, dim].
+            logit_scale: Multiplier applied to the similarity before the
+                sigmoid.
+
+        Returns:
+            np.ndarray of shape [num_frames, num_texts] with values in (0, 1).
+        """
+        similarity = frame_embeddings @ text_embeddings.T
+        # SigLIP uses sigmoid, not softmax
+        return 1 / (1 + np.exp(-similarity * logit_scale))
+class GroundingDINODetector:
+    """
+    Grounding DINO for open-vocabulary object detection with attribute queries.
+    Supports complex queries like "person wearing white clothes", "red car", etc.
+    """
+
+    # Labels queried by detect_default_attributes().
+    DEFAULT_LABELS: Tuple[str, ...] = (
+        "person", "car", "truck", "bicycle", "motorcycle",
+        "dog", "cat", "bird", "chair", "table",
+        "building", "tree", "sign", "phone", "bag",
+    )
+
+    def __init__(self, model_name: str = "IDEA-Research/grounding-dino-tiny",
+                 device: str = "cpu",
+                 box_threshold: float = 0.35,
+                 text_threshold: float = 0.25):
+        """
+        Load the processor and the model, move the model to *device*, and put
+        it in eval mode.
+
+        Args:
+            model_name: Checkpoint identifier passed to ``from_pretrained``.
+            device: Torch device string the model and all inputs are moved to.
+            box_threshold: Forwarded to post-processing as ``threshold``.
+            text_threshold: Forwarded to post-processing as ``text_threshold``.
+        """
+        # Imported here so that importing this module does not itself
+        # require transformers.
+        from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
+        print(f"🔄 Loading Grounding DINO ({model_name}) on {device}...")
+        self.processor = AutoProcessor.from_pretrained(model_name)
+        self.model = AutoModelForZeroShotObjectDetection.from_pretrained(
+            model_name
+        ).to(device).eval()
+        self.device = device
+        self.box_threshold = box_threshold
+        self.text_threshold = text_threshold
+        print(f"   ✅ Grounding DINO loaded")
+
+    def _format_query(self, labels: List[str]) -> str:
+        """
+        Format labels into the Grounding DINO query string.
+
+        Each label is lowercased and trimmed, a trailing "." is removed so it
+        cannot add an extra separator, and blank labels are dropped. The
+        remaining labels are joined with " . " and terminated with " .".
+
+        Example: ["person in white", "red car"] → "person in white . red car ."
+
+        Returns:
+            The query string, or "" when no usable label remains.
+        """
+        cleaned = (label.lower().strip().rstrip(".").rstrip() for label in labels)
+        phrases = [phrase for phrase in cleaned if phrase]
+        if not phrases:
+            return ""
+        return " . ".join(phrases) + " ."
+
+    @torch.no_grad()
+    def detect(self, image: Image.Image, labels: List[str]) -> List[Detection]:
+        """
+        Detect objects matching the given text labels in an image.
+
+        Args:
+            image: PIL Image
+            labels: List of text descriptions, e.g. ["person wearing white clothes", "red car"]
+
+        Returns:
+            List of Detection objects with labels, confidence, and bounding
+            boxes rounded to two decimals. Empty when *labels* contains no
+            usable text; the model is not run in that case.
+        """
+        text_query = self._format_query(labels)
+        if not text_query:
+            # Nothing to look for: skip the forward pass instead of querying
+            # the model with an empty prompt.
+            return []
+        inputs = self.processor(
+            images=image,
+            text=text_query,
+            return_tensors="pt",
+        ).to(self.device)
+        outputs = self.model(**inputs)
+        results = self.processor.post_process_grounded_object_detection(
+            outputs,
+            inputs.input_ids,
+            threshold=self.box_threshold,
+            text_threshold=self.text_threshold,
+            target_sizes=[image.size[::-1]],  # (height, width)
+        )
+        if not results:
+            return []
+        # A single image was submitted, so only the first result is relevant.
+        first = results[0]
+        return [
+            Detection(
+                label=text_label,
+                confidence=float(score),
+                bbox=[round(coord, 2) for coord in box.tolist()],
+            )
+            for box, score, text_label in zip(
+                first["boxes"], first["scores"], first["text_labels"]
+            )
+        ]
+
+    def detect_default_attributes(self, image: Image.Image) -> List[Detection]:
+        """
+        Run detect() on *image* with the built-in DEFAULT_LABELS set.
+
+        Gradient tracking is already disabled inside detect(), so no extra
+        decorator is needed here.
+        """
+        return self.detect(image, list(self.DEFAULT_LABELS))