Commit: "Update modeling_falcon_perception.py" — diff summary for modeling_falcon_perception.py (changed, +70 lines / −10 lines).
@@ -817,12 +817,64 @@ class FalconPerceptionForSegmentation(PreTrainedModel):
|
|
| 817 |
all_hw.append(v)
|
| 818 |
return torch.tensor(all_xy), torch.tensor(all_hw)
|
| 819 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 820 |
def _postprocess_aux(
|
| 821 |
self,
|
| 822 |
aux_list: list,
|
| 823 |
pixel_mask_hw: T,
|
| 824 |
orig_hw: tuple[int, int],
|
| 825 |
threshold: float,
|
|
|
|
| 826 |
) -> list[dict]:
|
| 827 |
"""Convert raw aux outputs into structured detections with RLE masks."""
|
| 828 |
orig_h, orig_w = orig_hw
|
|
@@ -838,8 +890,8 @@ class FalconPerceptionForSegmentation(PreTrainedModel):
|
|
| 838 |
min_h = min_w = 0
|
| 839 |
act_h = act_w = None
|
| 840 |
|
| 841 |
-
# Group into triplets: coord, size, mask
|
| 842 |
-
|
| 843 |
step = 3 # coord, size, mask
|
| 844 |
for i in range(0, len(aux_list), step):
|
| 845 |
if i + 2 >= len(aux_list):
|
|
@@ -861,15 +913,23 @@ class FalconPerceptionForSegmentation(PreTrainedModel):
|
|
| 861 |
|
| 862 |
# Threshold
|
| 863 |
binary_mask = (torch.sigmoid(mask_logits) > threshold).bool()
|
|
|
|
| 864 |
|
| 865 |
-
|
| 866 |
-
|
| 867 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 868 |
|
| 869 |
-
|
| 870 |
-
|
| 871 |
-
|
| 872 |
-
|
| 873 |
-
}
|
|
|
|
| 874 |
|
| 875 |
return detections
|
|
|
|
| 817 |
all_hw.append(v)
|
| 818 |
return torch.tensor(all_xy), torch.tensor(all_hw)
|
| 819 |
|
| 820 |
+
@staticmethod
|
| 821 |
+
def _mask_nms(
|
| 822 |
+
binary_masks: list[torch.Tensor],
|
| 823 |
+
iou_threshold: float = 0.6,
|
| 824 |
+
nms_max_side: int = 256,
|
| 825 |
+
) -> list[int]:
|
| 826 |
+
"""
|
| 827 |
+
Fast vectorised mask NMS on binary (H, W) tensors.
|
| 828 |
+
|
| 829 |
+
Returns the list of kept indices ordered by descending mask score.
|
| 830 |
+
The IoU matrix is computed via a single batched matmul; suppression
|
| 831 |
+
uses one GPU boolean op per kept mask — no .item() in the inner loop.
|
| 832 |
+
"""
|
| 833 |
+
N = len(binary_masks)
|
| 834 |
+
if N <= 1:
|
| 835 |
+
return list(range(N))
|
| 836 |
+
|
| 837 |
+
device = binary_masks[0].device
|
| 838 |
+
base_h, base_w = binary_masks[0].shape
|
| 839 |
+
scale = min(1.0, nms_max_side / max(base_h, base_w))
|
| 840 |
+
th = max(1, int(round(base_h * scale)))
|
| 841 |
+
tw = max(1, int(round(base_w * scale)))
|
| 842 |
+
|
| 843 |
+
resized = []
|
| 844 |
+
for m in binary_masks:
|
| 845 |
+
m = m.float()
|
| 846 |
+
if m.shape != (th, tw):
|
| 847 |
+
m = F.interpolate(
|
| 848 |
+
m[None, None], size=(th, tw), mode="bilinear", align_corners=False
|
| 849 |
+
).squeeze()
|
| 850 |
+
resized.append(m)
|
| 851 |
+
|
| 852 |
+
binary = torch.stack(resized) # (N, th, tw)
|
| 853 |
+
flat = binary.view(N, -1) # (N, th*tw)
|
| 854 |
+
areas = flat.sum(dim=1) # (N,)
|
| 855 |
+
scores = areas # larger mask = higher priority
|
| 856 |
+
intersection = flat @ flat.T # (N, N)
|
| 857 |
+
union = areas[:, None] + areas[None, :] - intersection
|
| 858 |
+
iou = intersection / union.clamp(min=1)
|
| 859 |
+
|
| 860 |
+
order = scores.argsort(descending=True)
|
| 861 |
+
suppressed = torch.zeros(N, dtype=torch.bool, device=device)
|
| 862 |
+
keep = []
|
| 863 |
+
for idx in order.tolist():
|
| 864 |
+
if suppressed[idx]:
|
| 865 |
+
continue
|
| 866 |
+
keep.append(idx)
|
| 867 |
+
suppressed |= iou[idx] > iou_threshold
|
| 868 |
+
|
| 869 |
+
return keep
|
| 870 |
+
|
| 871 |
def _postprocess_aux(
|
| 872 |
self,
|
| 873 |
aux_list: list,
|
| 874 |
pixel_mask_hw: T,
|
| 875 |
orig_hw: tuple[int, int],
|
| 876 |
threshold: float,
|
| 877 |
+
nms_iou_threshold: float = 0.6,
|
| 878 |
) -> list[dict]:
|
| 879 |
"""Convert raw aux outputs into structured detections with RLE masks."""
|
| 880 |
orig_h, orig_w = orig_hw
|
|
|
|
| 890 |
min_h = min_w = 0
|
| 891 |
act_h = act_w = None
|
| 892 |
|
| 893 |
+
# Group into triplets: coord, size, mask — build binary masks first
|
| 894 |
+
candidates = []
|
| 895 |
step = 3 # coord, size, mask
|
| 896 |
for i in range(0, len(aux_list), step):
|
| 897 |
if i + 2 >= len(aux_list):
|
|
|
|
| 913 |
|
| 914 |
# Threshold
|
| 915 |
binary_mask = (torch.sigmoid(mask_logits) > threshold).bool()
|
| 916 |
+
candidates.append({"xy": xy, "hw": hw, "binary_mask": binary_mask})
|
| 917 |
|
| 918 |
+
if not candidates:
|
| 919 |
+
return []
|
| 920 |
+
|
| 921 |
+
# NMS on binary masks before RLE encoding
|
| 922 |
+
keep_indices = self._mask_nms(
|
| 923 |
+
[c["binary_mask"] for c in candidates],
|
| 924 |
+
iou_threshold=nms_iou_threshold,
|
| 925 |
+
)
|
| 926 |
+
candidates = [candidates[i] for i in keep_indices]
|
| 927 |
|
| 928 |
+
# Encode survivors as COCO RLE
|
| 929 |
+
detections = []
|
| 930 |
+
for c in candidates:
|
| 931 |
+
rle_list = self._mask_to_coco_rle(c["binary_mask"].unsqueeze(0))
|
| 932 |
+
mask_rle = rle_list[0] if rle_list else {"counts": "", "size": [orig_h, orig_w]}
|
| 933 |
+
detections.append({"xy": c["xy"], "hw": c["hw"], "mask_rle": mask_rle})
|
| 934 |
|
| 935 |
return detections
|