notRaphael committed on
Commit
abb7d19
·
verified ·
1 Parent(s): 75e03dd

fix: update visual_encoders.py - verified API for transformers 5.x (SigLIP2 + Grounding DINO)

Browse files
Files changed (1) hide show
  1. video_intelligence/visual_encoders.py +39 -22
video_intelligence/visual_encoders.py CHANGED
@@ -1,7 +1,13 @@
1
  """
2
  Video Intelligence Platform — Visual Encoders
3
  SigLIP2 for frame embeddings + Grounding DINO for attribute detection.
4
- Both run on CPU (no GPU required).
 
 
 
 
 
 
5
  """
6
  import io
7
  import torch
@@ -25,6 +31,13 @@ class SigLIPEncoder:
25
  """
26
  SigLIP2 encoder for frame → embedding and text → embedding.
27
  Shared embedding space enables cross-modal similarity search.
 
 
 
 
 
 
 
28
  """
29
 
30
  def __init__(self, model_name: str = "google/siglip2-so400m-patch14-384",
@@ -42,10 +55,10 @@ class SigLIPEncoder:
42
 
43
  @torch.no_grad()
44
  def embed_frames(self, images: List[Image.Image],
45
- batch_size: int = 8) -> np.ndarray:
46
  """
47
  Embed a list of PIL images into normalized vectors.
48
-
49
  Returns:
50
  np.ndarray of shape [N, 1152], L2-normalized
51
  """
@@ -54,6 +67,7 @@ class SigLIPEncoder:
54
  for i in range(0, len(images), batch_size):
55
  batch = images[i:i + batch_size]
56
  inputs = self.processor(images=batch, return_tensors="pt").to(self.device)
 
57
  outputs = self.model.get_image_features(**inputs)
58
  embeddings = outputs.pooler_output # [B, 1152]
59
  embeddings = F.normalize(embeddings, dim=-1)
@@ -65,7 +79,7 @@ class SigLIPEncoder:
65
  def embed_texts(self, texts: List[str]) -> np.ndarray:
66
  """
67
  Embed text queries into the same space as frames.
68
-
69
  Returns:
70
  np.ndarray of shape [N, 1152], L2-normalized
71
  """
@@ -77,6 +91,7 @@ class SigLIPEncoder:
77
  padding="max_length", # CRITICAL: required for SigLIP
78
  return_tensors="pt",
79
  ).to(self.device)
 
80
  outputs = self.model.get_text_features(**inputs)
81
  embeddings = outputs.pooler_output # [N, 1152]
82
  embeddings = F.normalize(embeddings, dim=-1)
@@ -84,11 +99,11 @@ class SigLIPEncoder:
84
 
85
  @torch.no_grad()
86
  def compute_similarity(self, frame_embeddings: np.ndarray,
87
- text_embeddings: np.ndarray) -> np.ndarray:
88
  """
89
  Compute cosine similarity between frame and text embeddings.
90
  Uses sigmoid (SigLIP objective) for per-pair probabilities.
91
-
92
  Returns:
93
  np.ndarray of shape [num_frames, num_texts]
94
  """
@@ -102,6 +117,13 @@ class GroundingDINODetector:
102
  """
103
  Grounding DINO for open-vocabulary object detection with attribute queries.
104
  Supports complex queries like "person wearing white clothes", "red car", etc.
 
 
 
 
 
 
 
105
  """
106
 
107
  def __init__(self, model_name: str = "IDEA-Research/grounding-dino-tiny",
@@ -120,29 +142,21 @@ class GroundingDINODetector:
120
  self.text_threshold = text_threshold
121
  print(f" ✅ Grounding DINO loaded")
122
 
123
- def _format_query(self, labels: List[str]) -> str:
124
- """
125
- Format labels into Grounding DINO query format.
126
- Rules: lowercase, each label ends with ' . '
127
-
128
- Example: ["person in white", "red car"] → "person in white . red car ."
129
- """
130
- formatted = " . ".join(l.lower().strip() for l in labels) + " ."
131
- return formatted
132
-
133
  @torch.no_grad()
134
  def detect(self, image: Image.Image, labels: List[str]) -> List[Detection]:
135
  """
136
  Detect objects matching the given text labels in an image.
137
-
138
  Args:
139
  image: PIL Image
140
  labels: List of text descriptions, e.g. ["person wearing white clothes", "red car"]
141
-
142
  Returns:
143
  List of Detection objects with labels, confidence, and bounding boxes
144
  """
145
- text_query = self._format_query(labels)
 
 
146
 
147
  inputs = self.processor(
148
  images=image,
@@ -152,19 +166,22 @@ class GroundingDINODetector:
152
 
153
  outputs = self.model(**inputs)
154
 
 
 
155
  results = self.processor.post_process_grounded_object_detection(
156
  outputs,
157
- inputs.input_ids,
158
  threshold=self.box_threshold,
159
  text_threshold=self.text_threshold,
160
- target_sizes=[image.size[::-1]], # (height, width)
161
  )
162
 
163
  detections = []
164
  if results:
165
  result = results[0]
 
 
166
  for box, score, text_label in zip(
167
- result["boxes"], result["scores"], result["text_labels"]
168
  ):
169
  detections.append(Detection(
170
  label=text_label,
 
1
  """
2
  Video Intelligence Platform — Visual Encoders
3
  SigLIP2 for frame embeddings + Grounding DINO for attribute detection.
4
+
5
+ Verified against transformers >= 5.x API (Apr 2026):
6
+ - SigLIP2: AutoModel + AutoProcessor → SiglipModel, get_image_features returns
7
+ BaseModelOutputWithPooling (has .pooler_output when return_dict=True, which is default)
8
+ - Grounding DINO: AutoModelForZeroShotObjectDetection + AutoProcessor
9
+ post_process_grounded_object_detection accepts input_ids (optional), threshold (not box_threshold),
10
+ returns a list of per-image result dicts, each with "text_labels" and "labels" keys
11
  """
12
  import io
13
  import torch
 
31
  """
32
  SigLIP2 encoder for frame → embedding and text → embedding.
33
  Shared embedding space enables cross-modal similarity search.
34
+
35
+ Model: google/siglip2-so400m-patch14-384 (1152-dim embeddings)
36
+ Key details:
37
+ - get_image_features() returns BaseModelOutputWithPooling (return_dict=True default)
38
+ - .pooler_output gives [B, 1152] pooled representation
39
+ - Text MUST use padding="max_length" (SigLIP training requirement)
40
+ - Use sigmoid (not softmax) for similarity scores
41
  """
42
 
43
  def __init__(self, model_name: str = "google/siglip2-so400m-patch14-384",
 
55
 
56
  @torch.no_grad()
57
  def embed_frames(self, images: List[Image.Image],
58
+ batch_size: int = 8) -> np.ndarray:
59
  """
60
  Embed a list of PIL images into normalized vectors.
61
+
62
  Returns:
63
  np.ndarray of shape [N, 1152], L2-normalized
64
  """
 
67
  for i in range(0, len(images), batch_size):
68
  batch = images[i:i + batch_size]
69
  inputs = self.processor(images=batch, return_tensors="pt").to(self.device)
70
+ # get_image_features returns BaseModelOutputWithPooling (return_dict=True by default)
71
  outputs = self.model.get_image_features(**inputs)
72
  embeddings = outputs.pooler_output # [B, 1152]
73
  embeddings = F.normalize(embeddings, dim=-1)
 
79
  def embed_texts(self, texts: List[str]) -> np.ndarray:
80
  """
81
  Embed text queries into the same space as frames.
82
+
83
  Returns:
84
  np.ndarray of shape [N, 1152], L2-normalized
85
  """
 
91
  padding="max_length", # CRITICAL: required for SigLIP
92
  return_tensors="pt",
93
  ).to(self.device)
94
+ # get_text_features returns BaseModelOutputWithPooling (return_dict=True by default)
95
  outputs = self.model.get_text_features(**inputs)
96
  embeddings = outputs.pooler_output # [N, 1152]
97
  embeddings = F.normalize(embeddings, dim=-1)
 
99
 
100
  @torch.no_grad()
101
  def compute_similarity(self, frame_embeddings: np.ndarray,
102
+ text_embeddings: np.ndarray) -> np.ndarray:
103
  """
104
  Compute cosine similarity between frame and text embeddings.
105
  Uses sigmoid (SigLIP objective) for per-pair probabilities.
106
+
107
  Returns:
108
  np.ndarray of shape [num_frames, num_texts]
109
  """
 
117
  """
118
  Grounding DINO for open-vocabulary object detection with attribute queries.
119
  Supports complex queries like "person wearing white clothes", "red car", etc.
120
+
121
+ Model: IDEA-Research/grounding-dino-tiny
122
+ Key details (transformers >= 5.x):
123
+ - Processor's __call__ accepts text as str, list[str], or list[list[str]]
124
+ and auto-converts to the "label1 . label2 ." format internally
125
+ - post_process_grounded_object_detection: input_ids is optional,
126
+ uses 'threshold' (not 'box_threshold'), returns both 'text_labels' and 'labels'
127
  """
128
 
129
  def __init__(self, model_name: str = "IDEA-Research/grounding-dino-tiny",
 
142
  self.text_threshold = text_threshold
143
  print(f" ✅ Grounding DINO loaded")
144
 
 
 
 
 
 
 
 
 
 
 
145
  @torch.no_grad()
146
  def detect(self, image: Image.Image, labels: List[str]) -> List[Detection]:
147
  """
148
  Detect objects matching the given text labels in an image.
149
+
150
  Args:
151
  image: PIL Image
152
  labels: List of text descriptions, e.g. ["person wearing white clothes", "red car"]
153
+
154
  Returns:
155
  List of Detection objects with labels, confidence, and bounding boxes
156
  """
157
+ # Processor accepts list of labels directly and converts to correct format
158
+ # Also accepts the "label1 . label2 ." string format
159
+ text_query = [l.lower().strip() for l in labels]
160
 
161
  inputs = self.processor(
162
  images=image,
 
166
 
167
  outputs = self.model(**inputs)
168
 
169
+ # transformers >= 5.x: threshold (not box_threshold), input_ids optional
170
+ # target_sizes expects (height, width)
171
  results = self.processor.post_process_grounded_object_detection(
172
  outputs,
 
173
  threshold=self.box_threshold,
174
  text_threshold=self.text_threshold,
175
+ target_sizes=[(image.height, image.width)],
176
  )
177
 
178
  detections = []
179
  if results:
180
  result = results[0]
181
+ # Both "text_labels" and "labels" exist in current API
182
+ label_key = "text_labels" if "text_labels" in result else "labels"
183
  for box, score, text_label in zip(
184
+ result["boxes"], result["scores"], result[label_key]
185
  ):
186
  detections.append(Detection(
187
  label=text_label,