notRaphael
/

video-intelligence-platform

Model card Files Files and versions

xet

Community

notRaphael committed on 30 days ago

Commit

abb7d19

verified ·

1 Parent(s): 75e03dd

fix: update visual_encoders.py - verified API for transformers 5.x (SigLIP2 + Grounding DINO)

Browse files

Files changed (1) hide show

video_intelligence/visual_encoders.py +39 -22

video_intelligence/visual_encoders.py CHANGED Viewed

@@ -1,7 +1,13 @@
 """
 Video Intelligence Platform — Visual Encoders
 SigLIP2 for frame embeddings + Grounding DINO for attribute detection.
-Both run on CPU (no GPU required).
 """
 import io
 import torch
@@ -25,6 +31,13 @@ class SigLIPEncoder:
     """
     SigLIP2 encoder for frame → embedding and text → embedding.
     Shared embedding space enables cross-modal similarity search.
     """
     def __init__(self, model_name: str = "google/siglip2-so400m-patch14-384",
@@ -42,10 +55,10 @@ class SigLIPEncoder:
     @torch.no_grad()
     def embed_frames(self, images: List[Image.Image],
-                      batch_size: int = 8) -> np.ndarray:
         """
         Embed a list of PIL images into normalized vectors.
         Returns:
             np.ndarray of shape [N, 1152], L2-normalized
         """
@@ -54,6 +67,7 @@ class SigLIPEncoder:
         for i in range(0, len(images), batch_size):
             batch = images[i:i + batch_size]
             inputs = self.processor(images=batch, return_tensors="pt").to(self.device)
             outputs = self.model.get_image_features(**inputs)
             embeddings = outputs.pooler_output  # [B, 1152]
             embeddings = F.normalize(embeddings, dim=-1)
@@ -65,7 +79,7 @@ class SigLIPEncoder:
     def embed_texts(self, texts: List[str]) -> np.ndarray:
         """
         Embed text queries into the same space as frames.
         Returns:
             np.ndarray of shape [N, 1152], L2-normalized
         """
@@ -77,6 +91,7 @@ class SigLIPEncoder:
             padding="max_length",  # CRITICAL: required for SigLIP
             return_tensors="pt",
         ).to(self.device)
         outputs = self.model.get_text_features(**inputs)
         embeddings = outputs.pooler_output  # [N, 1152]
         embeddings = F.normalize(embeddings, dim=-1)
@@ -84,11 +99,11 @@ class SigLIPEncoder:
     @torch.no_grad()
     def compute_similarity(self, frame_embeddings: np.ndarray,
-                            text_embeddings: np.ndarray) -> np.ndarray:
         """
         Compute cosine similarity between frame and text embeddings.
         Uses sigmoid (SigLIP objective) for per-pair probabilities.
         Returns:
             np.ndarray of shape [num_frames, num_texts]
         """
@@ -102,6 +117,13 @@ class GroundingDINODetector:
     """
     Grounding DINO for open-vocabulary object detection with attribute queries.
     Supports complex queries like "person wearing white clothes", "red car", etc.
     """
     def __init__(self, model_name: str = "IDEA-Research/grounding-dino-tiny",
@@ -120,29 +142,21 @@ class GroundingDINODetector:
         self.text_threshold = text_threshold
         print(f"   ✅ Grounding DINO loaded")
-    def _format_query(self, labels: List[str]) -> str:
-        """
-        Format labels into Grounding DINO query format.
-        Rules: lowercase, each label ends with ' . '
-        Example: ["person in white", "red car"] → "person in white . red car ."
-        """
-        formatted = " . ".join(l.lower().strip() for l in labels) + " ."
-        return formatted
     @torch.no_grad()
     def detect(self, image: Image.Image, labels: List[str]) -> List[Detection]:
         """
         Detect objects matching the given text labels in an image.
         Args:
             image: PIL Image
             labels: List of text descriptions, e.g. ["person wearing white clothes", "red car"]
         Returns:
             List of Detection objects with labels, confidence, and bounding boxes
         """
-        text_query = self._format_query(labels)
         inputs = self.processor(
             images=image,
@@ -152,19 +166,22 @@ class GroundingDINODetector:
         outputs = self.model(**inputs)
         results = self.processor.post_process_grounded_object_detection(
             outputs,
-            inputs.input_ids,
             threshold=self.box_threshold,
             text_threshold=self.text_threshold,
-            target_sizes=[image.size[::-1]],  # (height, width)
         )
         detections = []
         if results:
             result = results[0]
             for box, score, text_label in zip(
-                result["boxes"], result["scores"], result["text_labels"]
             ):
                 detections.append(Detection(
                     label=text_label,

 """
 Video Intelligence Platform — Visual Encoders
 SigLIP2 for frame embeddings + Grounding DINO for attribute detection.
+Verified against transformers >= 5.x API (Apr 2026):
+- SigLIP2: AutoModel + AutoProcessor → SiglipModel, get_image_features returns
+  BaseModelOutputWithPooling (has .pooler_output when return_dict=True, which is the default)
+- Grounding DINO: AutoModelForZeroShotObjectDetection + AutoProcessor
+  post_process_grounded_object_detection accepts input_ids (optional), threshold (not box_threshold),
+  returns dict with "text_labels" and "labels" keys
 """
 import io
 import torch
     """
     SigLIP2 encoder for frame → embedding and text → embedding.
     Shared embedding space enables cross-modal similarity search.
+    Model: google/siglip2-so400m-patch14-384 (1152-dim embeddings)
+    Key details:
+    - get_image_features() returns BaseModelOutputWithPooling (return_dict=True default)
+    - .pooler_output gives [B, 1152] pooled representation
+    - Text MUST use padding="max_length" (SigLIP training requirement)
+    - Use sigmoid (not softmax) for similarity scores
     """
     def __init__(self, model_name: str = "google/siglip2-so400m-patch14-384",
     @torch.no_grad()
     def embed_frames(self, images: List[Image.Image],
+                     batch_size: int = 8) -> np.ndarray:
         """
         Embed a list of PIL images into normalized vectors.
         Returns:
             np.ndarray of shape [N, 1152], L2-normalized
         """
         for i in range(0, len(images), batch_size):
             batch = images[i:i + batch_size]
             inputs = self.processor(images=batch, return_tensors="pt").to(self.device)
+            # get_image_features returns BaseModelOutputWithPooling (return_dict=True by default)
             outputs = self.model.get_image_features(**inputs)
             embeddings = outputs.pooler_output  # [B, 1152]
             embeddings = F.normalize(embeddings, dim=-1)
     def embed_texts(self, texts: List[str]) -> np.ndarray:
         """
         Embed text queries into the same space as frames.
         Returns:
             np.ndarray of shape [N, 1152], L2-normalized
         """
             padding="max_length",  # CRITICAL: required for SigLIP
             return_tensors="pt",
         ).to(self.device)
+        # get_text_features returns BaseModelOutputWithPooling (return_dict=True by default)
         outputs = self.model.get_text_features(**inputs)
         embeddings = outputs.pooler_output  # [N, 1152]
         embeddings = F.normalize(embeddings, dim=-1)
     @torch.no_grad()
     def compute_similarity(self, frame_embeddings: np.ndarray,
+                           text_embeddings: np.ndarray) -> np.ndarray:
         """
         Compute cosine similarity between frame and text embeddings.
         Uses sigmoid (SigLIP objective) for per-pair probabilities.
         Returns:
             np.ndarray of shape [num_frames, num_texts]
         """
     """
     Grounding DINO for open-vocabulary object detection with attribute queries.
     Supports complex queries like "person wearing white clothes", "red car", etc.
+    Model: IDEA-Research/grounding-dino-tiny
+    Key details (transformers >= 5.x):
+    - Processor's __call__ accepts text as str, list[str], or list[list[str]]
+      and auto-converts to the "label1 . label2 ." format internally
+    - post_process_grounded_object_detection: input_ids is optional,
+      uses 'threshold' (not 'box_threshold'), returns both 'text_labels' and 'labels'
     """
     def __init__(self, model_name: str = "IDEA-Research/grounding-dino-tiny",
         self.text_threshold = text_threshold
         print(f"   ✅ Grounding DINO loaded")
     @torch.no_grad()
     def detect(self, image: Image.Image, labels: List[str]) -> List[Detection]:
         """
         Detect objects matching the given text labels in an image.
         Args:
             image: PIL Image
             labels: List of text descriptions, e.g. ["person wearing white clothes", "red car"]
         Returns:
             List of Detection objects with labels, confidence, and bounding boxes
         """
+        # Processor accepts list of labels directly and converts to correct format
+        # Also accepts the "label1 . label2 ." string format
+        text_query = [l.lower().strip() for l in labels]
         inputs = self.processor(
             images=image,
         outputs = self.model(**inputs)
+        # transformers >= 5.x: threshold (not box_threshold), input_ids optional
+        # target_sizes expects (height, width)
         results = self.processor.post_process_grounded_object_detection(
             outputs,
             threshold=self.box_threshold,
             text_threshold=self.text_threshold,
+            target_sizes=[(image.height, image.width)],
         )
         detections = []
         if results:
             result = results[0]
+            # Both "text_labels" and "labels" exist in current API
+            label_key = "text_labels" if "text_labels" in result else "labels"
             for box, score, text_label in zip(
+                result["boxes"], result["scores"], result[label_key]
             ):
                 detections.append(Detection(
                     label=text_label,