Spaces:
Running
Running
Zhen Ye
committed on
Commit
·
af0f84f
1
Parent(s):
8bc4370
fix: replaced batch with sequential
Browse files
models/detectors/grounding_dino.py
CHANGED
|
@@ -103,49 +103,7 @@ class GroundingDinoDetector(ObjectDetector):
|
|
| 103 |
return self._parse_single_result(processed_list[0])
|
| 104 |
|
| 105 |
def predict_batch(self, frames: Sequence[np.ndarray], queries: Sequence[str]) -> Sequence[DetectionResult]:
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
from types import SimpleNamespace
|
| 111 |
-
|
| 112 |
-
frames_rgb = [cv2.cvtColor(f, cv2.COLOR_BGR2RGB) for f in frames]
|
| 113 |
-
prompt = self._build_prompt(queries)
|
| 114 |
-
|
| 115 |
-
# 1. Preprocess each frame individually (avoids batch processor issues)
|
| 116 |
-
individual_inputs = []
|
| 117 |
-
for frame in frames_rgb:
|
| 118 |
-
inp = self.processor(images=frame, text=prompt, return_tensors="pt")
|
| 119 |
-
individual_inputs.append(inp)
|
| 120 |
-
|
| 121 |
-
# 2. Stack into batch for GPU forward pass
|
| 122 |
-
# All frames are from the same video (same resolution), so tensor shapes match.
|
| 123 |
-
# If they don't (edge case), fall back to sequential predict().
|
| 124 |
-
batch_keys = list(individual_inputs[0].keys())
|
| 125 |
-
try:
|
| 126 |
-
batch_inputs = {}
|
| 127 |
-
for key in batch_keys:
|
| 128 |
-
batch_inputs[key] = torch.cat(
|
| 129 |
-
[inp[key] for inp in individual_inputs], dim=0
|
| 130 |
-
).to(self.device)
|
| 131 |
-
except RuntimeError:
|
| 132 |
-
# Shape mismatch (different resolutions) — fall back to sequential
|
| 133 |
-
return [self.predict(f, queries) for f in frames]
|
| 134 |
-
|
| 135 |
-
# 3. Batched forward pass (GPU-efficient)
|
| 136 |
-
with torch.no_grad():
|
| 137 |
-
outputs = self.model(**batch_inputs)
|
| 138 |
-
|
| 139 |
-
# 4. Per-frame post-processing using individual (non-batched) input_ids
|
| 140 |
-
single_input_ids = individual_inputs[0]["input_ids"].to(self.device)
|
| 141 |
-
results = []
|
| 142 |
-
for i in range(len(frames)):
|
| 143 |
-
frame_outputs = SimpleNamespace(
|
| 144 |
-
logits=outputs.logits[i : i + 1],
|
| 145 |
-
pred_boxes=outputs.pred_boxes[i : i + 1],
|
| 146 |
-
)
|
| 147 |
-
target_sizes = torch.tensor([frames[i].shape[:2]], device=self.device)
|
| 148 |
-
processed = self._post_process(frame_outputs, single_input_ids, target_sizes)
|
| 149 |
-
results.append(self._parse_single_result(processed[0]))
|
| 150 |
-
|
| 151 |
-
return results
|
|
|
|
| 103 |
return self._parse_single_result(processed_list[0])
|
| 104 |
|
| 105 |
def predict_batch(self, frames: Sequence[np.ndarray], queries: Sequence[str]) -> Sequence[DetectionResult]:
|
| 106 |
+
# Grounding DINO's forward pass produces degraded/zero logits at batch_size > 1
|
| 107 |
+
# (known HF issue #32206, #34346). Fall back to sequential single-frame inference
|
| 108 |
+
# which is the only path proven to produce correct detections.
|
| 109 |
+
return [self.predict(f, queries) for f in frames]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|