Spaces:

BiasLab2025
/

perception

Paused

App Files Files Community

Zhen Ye committed 15 days ago

Commit

3de3df3

1 Parent(s): 3d32b4a

Fix SAM3 batch prediction shape mismatch and add InternVL2 to frontend

Browse files

Files changed (2) hide show

LaserPerception/LaserPerception.html +3 -0
models/segmenters/sam3.py +100 -47

LaserPerception/LaserPerception.html CHANGED Viewed

@@ -81,6 +81,9 @@
                 <optgroup label="Drone Detection Models">
                   <option value="drone_yolo" data-kind="drone">Drone</option>
                 </optgroup>
               </select>
             </div>
             <div>

                 <optgroup label="Drone Detection Models">
                   <option value="drone_yolo" data-kind="drone">Drone</option>
                 </optgroup>
+                <optgroup label="Vision-Language Models">
+                  <option value="internvl2_military" data-kind="object">InternVL2 (Military)</option>
+                </optgroup>
               </select>
             </div>
             <div>

models/segmenters/sam3.py CHANGED Viewed

@@ -87,6 +87,95 @@ class SAM3Segmenter(Segmenter):
             boxes=boxes_array,
         )
     def predict(self, frame: np.ndarray, text_prompts: Optional[list] = None) -> SegmentationResult:
         """
         Run SAM3 segmentation on a frame.
@@ -115,51 +204,8 @@ class SAM3Segmenter(Segmenter):
             images=pil_image, text=text_prompts, return_tensors="pt"
         ).to(self.device)
-        # Handle batch size mismatch between image (1) and prompts (N) structure
-        pixel_values = inputs.get("pixel_values")
-        input_ids = inputs.get("input_ids")
-        if (
-            pixel_values is not None
-            and input_ids is not None
-            and pixel_values.shape[0] == 1
-            and input_ids.shape[0] > 1
-        ):
-            target_batch_size = input_ids.shape[0]
-            logging.debug(f"Expanding SAM3 vision inputs from 1 to {target_batch_size} using embeddings reuse.")
-            # 1. Compute vision embeddings once
-            with torch.no_grad():
-                vision_outputs = self.model.get_vision_features(
-                    pixel_values=pixel_values
-                )
-            # 2. Expand vision embeddings
-            # vision_outputs is a ModelOutput (dict-like)
-            for key, value in vision_outputs.items():
-                if isinstance(value, torch.Tensor):
-                    if value.shape[0] == 1:
-                        vision_outputs[key] = value.repeat(target_batch_size, *([1]*(value.dim()-1)))
-                elif isinstance(value, (list, tuple)):
-                    new_list = []
-                    for v in value:
-                        if isinstance(v, torch.Tensor) and v.shape[0] == 1:
-                            new_list.append(v.repeat(target_batch_size, *([1]*(v.dim()-1))))
-                        else:
-                            new_list.append(v)
-                    # Preserve type (tuple vs list)
-                    vision_outputs[key] = type(value)(new_list)
-            # 3. Update inputs for model call
-            inputs["vision_embeds"] = vision_outputs
-            del inputs["pixel_values"] # Mutually exclusive with vision_embeds
-            # 4. Expand other metadata
-            if "original_sizes" in inputs and inputs["original_sizes"].shape[0] == 1:
-                inputs["original_sizes"] = inputs["original_sizes"].repeat(target_batch_size, 1)
-            if "reshape_input_sizes" in inputs and inputs["reshape_input_sizes"].shape[0] == 1:
-                inputs["reshape_input_sizes"] = inputs["reshape_input_sizes"].repeat(target_batch_size, 1)
         # Run inference
@@ -206,8 +252,15 @@ class SAM3Segmenter(Segmenter):
         prompts = text_prompts or ["object"]
-        # Same prompts for all images
-        inputs = self.processor(images=pil_images, text=[prompts]*len(frames), return_tensors="pt").to(self.device)
         with torch.no_grad():
             outputs = self.model(**inputs)

             boxes=boxes_array,
         )
+    def _expand_inputs_if_needed(self, inputs):
+        """
+        Helper to expand vision inputs (pixel_values or vision_embeds) to match text prompts.
+        Handles:
+        1. 1 image, N texts (Expand 1 -> N)
+        2. N images, N*M texts (Expand N -> N*M)
+        """
+        pixel_values = inputs.get("pixel_values")
+        input_ids = inputs.get("input_ids")
+        if (
+            pixel_values is not None
+            and input_ids is not None
+        ):
+            img_batch = pixel_values.shape[0]
+            text_batch = input_ids.shape[0]
+            should_expand = False
+            expansion_factor = 1
+            if img_batch == 1 and text_batch > 1:
+                should_expand = True
+                expansion_factor = text_batch
+            elif img_batch > 1 and text_batch > img_batch and text_batch % img_batch == 0:
+                should_expand = True
+                expansion_factor = text_batch // img_batch
+            if should_expand:
+                logging.debug(f"Expanding SAM3 vision inputs from {img_batch} to {text_batch} (factor {expansion_factor}) using embeddings reuse.")
+                # 1. Compute vision embeddings once for original images
+                with torch.no_grad():
+                    vision_outputs = self.model.get_vision_features(
+                        pixel_values=pixel_values
+                    )
+                # Iterate over keys to expand
+                keys_to_expand = list(vision_outputs.keys())
+                for key in keys_to_expand:
+                    value = getattr(vision_outputs, key, None)
+                    if value is None:
+                        # Try getItem
+                        try:
+                            value = vision_outputs[key]
+                        except:
+                            continue
+                    new_value = None
+                    if isinstance(value, torch.Tensor):
+                        # Ensure we only expand the batch dimension (dim 0)
+                        if value.shape[0] == img_batch:
+                             new_value = value.repeat_interleave(expansion_factor, dim=0)
+                    elif isinstance(value, (list, tuple)):
+                        new_list = []
+                        valid_expansion = False
+                        for i, v in enumerate(value):
+                            if isinstance(v, torch.Tensor) and v.shape[0] == img_batch:
+                                new_list.append(v.repeat_interleave(expansion_factor, dim=0))
+                                valid_expansion = True
+                            else:
+                                new_list.append(v)
+                        if valid_expansion:
+                            # Preserve type
+                            new_value = type(value)(new_list)
+                    if new_value is not None:
+                         # Update dict item if possible
+                         try:
+                            vision_outputs[key] = new_value
+                         except:
+                            pass
+                         # Update attribute explicitly if it exists
+                         if hasattr(vision_outputs, key):
+                             setattr(vision_outputs, key, new_value)
+                # 3. Update inputs for model call
+                inputs["vision_embeds"] = vision_outputs
+                del inputs["pixel_values"] # Mutually exclusive with vision_embeds
+                # 4. Expand other metadata
+                if "original_sizes" in inputs and inputs["original_sizes"].shape[0] == img_batch:
+                    inputs["original_sizes"] = inputs["original_sizes"].repeat_interleave(expansion_factor, dim=0)
+                if "reshape_input_sizes" in inputs and inputs["reshape_input_sizes"].shape[0] == img_batch:
+                    inputs["reshape_input_sizes"] = inputs["reshape_input_sizes"].repeat_interleave(expansion_factor, dim=0)
     def predict(self, frame: np.ndarray, text_prompts: Optional[list] = None) -> SegmentationResult:
         """
         Run SAM3 segmentation on a frame.
             images=pil_image, text=text_prompts, return_tensors="pt"
         ).to(self.device)
+        # Handle batch expansion
+        self._expand_inputs_if_needed(inputs)
         # Run inference
         prompts = text_prompts or ["object"]
+        # Flatten prompts for all images: [img1_p1, img1_p2, img2_p1, img2_p2, ...]
+        flattened_prompts = []
+        for _ in frames:
+            flattened_prompts.extend(prompts)
+        inputs = self.processor(images=pil_images, text=flattened_prompts, return_tensors="pt").to(self.device)
+        # Handle batch expansion
+        self._expand_inputs_if_needed(inputs)
         with torch.no_grad():
             outputs = self.model(**inputs)