Upload 5 files
utils/EdgeTAM_image_predictor.py
ADDED
@@ -0,0 +1,394 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import logging

from typing import List, Optional, Tuple, Union

import numpy as np
from PIL.Image import Image
from utils.transforms import SAM2Transforms, trunc_normal_
# import onnxruntime as ort
import axengine as ort
import cv2
import os


class ImagePredictor:
    def __init__(
        self,
        model_path,
        mask_threshold=0.0,
        max_hole_area=0.0,
        max_sprinkle_area=0.0,
        resolution=1024,
        **kwargs,
    ) -> None:
        """
        Uses SAM-2 to calculate the image embedding for an image, and then
        allows repeated, efficient mask prediction given prompts.

        Arguments:
          model_path (str): Directory containing the exported EdgeTAM model files.
          mask_threshold (float): The threshold to use when converting mask logits
            to binary masks. Masks are thresholded at 0 by default.
          max_hole_area (int): If max_hole_area > 0, we fill small holes of area
            up to max_hole_area in low_res_masks.
          max_sprinkle_area (int): If max_sprinkle_area > 0, we remove small sprinkles
            of area up to max_sprinkle_area in low_res_masks.
        """
        super().__init__()

        print("Loading EdgeTAM AXEngine models...")
        self.image_encoder = ort.InferenceSession(f"{model_path}/edgetam_image_encoder.axmodel")
        self.prompt_encoder = ort.InferenceSession(f"{model_path}/edgetam_prompt_encoder.axmodel")
        self.prompt_mask_encoder = ort.InferenceSession(f"{model_path}/edgetam_prompt_mask_encoder.axmodel")
        self.mask_decoder = ort.InferenceSession(f"{model_path}/edgetam_mask_decoder.axmodel")

        self.model_path = model_path

        self._transforms = SAM2Transforms(
            resolution=resolution,
            mask_threshold=mask_threshold,
            max_hole_area=max_hole_area,
            max_sprinkle_area=max_sprinkle_area,
        )
        # Predictor state
        self._is_image_set = False
        self._features = None
        self._orig_hw = None
        # Whether the predictor is set for a single image or a batch of images
        self._is_batch = False

        # Predictor config
        self.mask_threshold = mask_threshold
        self.num_feature_levels = 3
        self.no_mem_embed = np.zeros((1, 1, 256))
        trunc_normal_(self.no_mem_embed, std=0.02)

        # Spatial dims for backbone feature maps
        self._bb_feat_sizes = [
            (256, 256),
            (128, 128),
            (64, 64),
        ]

    def set_image(
        self,
        image: Union[np.ndarray, Image],
    ) -> None:
        """
        Calculates the image embeddings for the provided image, allowing
        masks to be predicted with the 'predict' method.

        Arguments:
          image (np.ndarray or PIL Image): The input image to embed, in RGB format
            with pixel values in [0, 255]. The image should be in HWC format if
            np.ndarray, or WHC format if PIL Image.
        """
        self.reset_predictor()
        # Transform the image to the form expected by the model
        if isinstance(image, np.ndarray):
            logging.info("For numpy array image, we assume (HxWxC) format")
            self._orig_hw = [image.shape[:2]]

        input_image = self._transforms(image).astype(np.float32)  # 3xHxW np.ndarray
        input_image = input_image[None, ...]
        # np.save(f"{self.model_path}/input_image.npy", input_image)

        assert (
            len(input_image.shape) == 4 and input_image.shape[1] == 3
        ), f"input_image must be of size 1x3xHxW, got {input_image.shape}"
        logging.info("Computing image embeddings for the provided image...")
        vision_feats = self.image_encoder.run(None, {"input_image": input_image.astype(np.float32)})

        feats = [
            np.transpose(feat[:, 0, :].reshape(H, W, feat.shape[-1]), (2, 0, 1))[np.newaxis, :]
            for feat, (H, W) in zip(reversed(vision_feats), reversed(self._bb_feat_sizes))
        ][::-1]

        self._features = {"image_embed": feats[-1], "high_res_feats": feats[:-1]}
        self._is_image_set = True
        logging.info("Image embeddings computed.")

    def predict(
        self,
        point_coords: Optional[np.ndarray] = None,
        point_labels: Optional[np.ndarray] = None,
        box: Optional[np.ndarray] = None,
        mask_input: Optional[np.ndarray] = None,
        multimask_output: bool = True,
        return_logits: bool = False,
        normalize_coords=True,
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Predict masks for the given input prompts, using the currently set image.

        Arguments:
          point_coords (np.ndarray or None): An Nx2 array of point prompts to the
            model. Each point is in (X, Y) in pixels.
          point_labels (np.ndarray or None): A length-N array of labels for the
            point prompts. 1 indicates a foreground point and 0 indicates a
            background point.
          box (np.ndarray or None): A length-4 array giving a box prompt to the
            model, in XYXY format.
          mask_input (np.ndarray): A low-resolution mask input to the model, typically
            coming from a previous prediction iteration. Has shape 1xHxW, where
            for SAM, H=W=256.
          multimask_output (bool): If true, the model will return three masks.
            For ambiguous input prompts (such as a single click), this will often
            produce better masks than a single prediction. If only a single
            mask is needed, the model's predicted quality score can be used
            to select the best mask. For non-ambiguous prompts, such as multiple
            input prompts, multimask_output=False can give better results.
          return_logits (bool): If true, returns un-thresholded mask logits
            instead of a binary mask.
          normalize_coords (bool): If true, the point coordinates will be normalized
            to the range [0, 1], and point_coords is expected to be given w.r.t.
            the image dimensions.

        Returns:
          (np.ndarray): The output masks in CxHxW format, where C is the
            number of masks, and (H, W) is the original image size.
          (np.ndarray): An array of length C containing the model's
            predictions for the quality of each mask.
          (np.ndarray): An array of shape CxHxW, where C is the number
            of masks and H=W=256. These low-resolution logits can be passed to
            a subsequent iteration as mask input.
        """
        if not self._is_image_set:
            raise RuntimeError(
                "An image must be set with .set_image(...) before mask prediction."
            )

        # Transform input prompts

        # Type checks / casts
        point_coords = point_coords.astype(np.float32) if point_coords is not None else None
        point_labels = point_labels.astype(np.float32) if point_labels is not None else None
        box = box.astype(np.float32) if box is not None else None
        mask_input = mask_input.astype(np.float32) if mask_input is not None else None

        mask_input, unnorm_coords, labels, unnorm_box = self._prep_prompts(
            point_coords, point_labels, box, mask_input, normalize_coords
        )

        masks, iou_predictions, low_res_masks = self._predict(
            unnorm_coords,
            labels,
            unnorm_box,
            mask_input,
            multimask_output,
            return_logits=return_logits,
        )

        masks_np = masks

        iou_predictions_np = iou_predictions[0]
        low_res_masks_np = low_res_masks[0]
        return masks_np, iou_predictions_np, low_res_masks_np

    def _prep_prompts(
        self, point_coords, point_labels, box, mask_logits, normalize_coords, img_idx=-1
    ):
        unnorm_coords, labels, unnorm_box, mask_input = None, None, None, None
        if point_coords is not None:
            assert (
                point_labels is not None
            ), "point_labels must be supplied if point_coords is supplied."
            unnorm_coords = self._transforms.transform_coords(
                point_coords, normalize=normalize_coords, orig_hw=self._orig_hw[img_idx]
            )

            if len(unnorm_coords.shape) == 2:
                unnorm_coords, labels = unnorm_coords[np.newaxis, ...], point_labels[np.newaxis, ...]
        if box is not None:
            unnorm_box = self._transforms.transform_boxes(
                box, normalize=normalize_coords, orig_hw=self._orig_hw[img_idx]
            )  # Bx2x2
        if mask_logits is not None:
            if len(mask_logits.shape) == 3:
                mask_logits = mask_logits[np.newaxis, :, :, :]

        return mask_logits, unnorm_coords, labels, unnorm_box

    def _predict(
        self,
        point_coords,
        point_labels,
        boxes=None,
        mask_input=None,
        multimask_output=True,
        return_logits=False,
        img_idx=-1,
    ):
        """
        Predict masks for the given input prompts, using the currently set image.
        Input prompts are batched np.ndarrays and are expected to already be
        transformed to the input frame using SAM2Transforms.

        Arguments:
          point_coords (np.ndarray or None): A BxNx2 array of point prompts to the
            model. Each point is in (X, Y) in pixels.
          point_labels (np.ndarray or None): A BxN array of labels for the
            point prompts. 1 indicates a foreground point and 0 indicates a
            background point.
          boxes (np.ndarray or None): A Bx4 array giving a box prompt to the
            model, in XYXY format.
          mask_input (np.ndarray): A low-resolution mask input to the model, typically
            coming from a previous prediction iteration. Has shape Bx1xHxW, where
            for SAM, H=W=256. Masks returned by a previous iteration of the
            predict method do not need further transformation.
          multimask_output (bool): If true, the model will return three masks.
            For ambiguous input prompts (such as a single click), this will often
            produce better masks than a single prediction. If only a single
            mask is needed, the model's predicted quality score can be used
            to select the best mask. For non-ambiguous prompts, such as multiple
            input prompts, multimask_output=False can give better results.
          return_logits (bool): If true, returns un-thresholded mask logits
            instead of a binary mask.

        Returns:
          (np.ndarray): The output masks in BxCxHxW format, where C is the
            number of masks, and (H, W) is the original image size.
          (np.ndarray): An array of shape BxC containing the model's
            predictions for the quality of each mask.
          (np.ndarray): An array of shape BxCxHxW, where C is the number
            of masks and H=W=256. These low-res logits can be passed to
            a subsequent iteration as mask input.
        """
        if not self._is_image_set:
            raise RuntimeError(
                "An image must be set with .set_image(...) before mask prediction."
            )

        if point_coords is not None:
            concat_points = (point_coords, point_labels)
        else:
            concat_points = None

        # Embed prompts
        if boxes is not None:
            box_coords = boxes.reshape(-1, 2, 2)
            box_labels = np.array([[2, 3]], dtype=np.float32)
            # tile the labels to one row per box (np equivalent of torch's repeat)
            box_labels = np.tile(box_labels, (boxes.shape[0], 1))
            # we merge "boxes" and "points" into a single "concat_points" input (where
            # boxes are added at the beginning) to sam_prompt_encoder
            if concat_points is not None:
                concat_coords = np.concatenate([box_coords, concat_points[0]], axis=1)
                concat_labels = np.concatenate([box_labels, concat_points[1]], axis=1)
                concat_points = (concat_coords, concat_labels)
            else:
                print("Only box input provided")
                concat_points = (box_coords, box_labels)

        # Pad/truncate the prompts to a fixed 4 points (the exported prompt
        # encoder expects a static input shape; at most 4 points are supported).
        input_coords = np.tile(concat_points[0], (4, 1))[:, :4, :]
        input_labels = np.tile(concat_points[1], (4))[:, :4]

        if mask_input is None or not mask_input.any():
            print("Get dense_embeddings_no_mask")
            sparse_embeddings = self.prompt_encoder.run(
                None,
                {
                    "point_coords": input_coords if concat_points is not None else np.array([]),
                    "point_labels": input_labels if concat_points is not None else np.array([])
                    # "boxes": boxes if boxes is not None else np.zeros((1, 4), dtype=np.float32)
                },
            )[0]
            # no mask prompt: use the precomputed "no mask" dense embedding
            dense_embeddings = np.load(f"{self.model_path}/dense_embeddings_no_mask.npy")
        else:
            print("Get dense_embeddings_mask")
            sparse_embeddings = self.prompt_encoder.run(
                None,
                {
                    "point_coords": input_coords if concat_points is not None else np.array([]),
                    "point_labels": input_labels if concat_points is not None else np.array([])
                    # "boxes": boxes if boxes is not None else np.zeros((1, 4), dtype=np.float32)
                },
            )[0]
            dense_embeddings = self.prompt_mask_encoder.run(
                None,
                {
                    "input.1": mask_input
                },
            )[0]

        # Predict masks
        batched_mode = (
            concat_points is not None and concat_points[0].shape[0] > 1
        )  # multi-object prediction

        high_res_features = [
            feat_level[img_idx][np.newaxis, ...]
            for feat_level in self._features["high_res_feats"]
        ]

        low_res_masks, iou_predictions = self.mask_decoder.run(
            None,
            {
                "image_embeddings": self._features["image_embed"][img_idx][np.newaxis, ...],
                # "image_pe": image_pe,
                "sparse_prompt_embeddings": sparse_embeddings,
                "dense_prompt_embeddings": dense_embeddings,
                "high_res_feat_0": high_res_features[0],
                "high_res_feat_1": high_res_features[1],
                # "multimask_output": np.array([1 if multimask_output else 0], dtype=np.int32),
            },
        )

        # Upscale the masks to the original image resolution
        mask = low_res_masks[0].transpose(1, 2, 0)  # HxWxC
        resize_masks = cv2.resize(mask, (self._orig_hw[img_idx][1], self._orig_hw[img_idx][0]), interpolation=cv2.INTER_LINEAR)

        resize_masks = resize_masks[np.newaxis, ...]  # 1xHxWxC
        resize_masks = np.clip(resize_masks, -32.0, 32.0)

        if not return_logits:
            resize_masks = resize_masks > self.mask_threshold

        return resize_masks, iou_predictions, low_res_masks

    def get_image_embedding(self):
        """
        Returns the image embeddings for the currently set image, with
        shape 1xCxHxW, where C is the embedding dimension and (H, W) are
        the embedding spatial dimensions of SAM (typically C=256, H=W=64).
        """
        if not self._is_image_set:
            raise RuntimeError(
                "An image must be set with .set_image(...) to generate an embedding."
            )
        assert (
            self._features is not None
        ), "Features must exist if an image has been set."
        return self._features["image_embed"]

    def reset_predictor(self) -> None:
        """
        Resets the image embeddings and other state variables.
        """
        self._is_image_set = False
        self._features = None
        self._orig_hw = None
        self._is_batch = False

    def _prepare_backbone_features(self, backbone_out):
        """Prepare and flatten visual features."""
        backbone_out = backbone_out.copy()
        assert len(backbone_out["backbone_fpn"]) == len(backbone_out["vision_pos_enc"])
        assert len(backbone_out["backbone_fpn"]) >= self.num_feature_levels

        feature_maps = backbone_out["backbone_fpn"][-self.num_feature_levels:]
        vision_pos_embeds = backbone_out["vision_pos_enc"][-self.num_feature_levels:]

        feat_sizes = [(x.shape[-2], x.shape[-1]) for x in vision_pos_embeds]
        # flatten NxCxHxW to HWxNxC
        vision_feats = [x.reshape(x.shape[0], x.shape[1], -1).transpose(2, 0, 1) for x in feature_maps]

        vision_pos_embeds = [x.reshape(x.shape[0], x.shape[1], -1).transpose(2, 0, 1) for x in vision_pos_embeds]

        return backbone_out, vision_feats, vision_pos_embeds, feat_sizes
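
A minimal usage sketch for the predictor above. The model directory, image path, and click coordinates are hypothetical; it assumes the four .axmodel files and dense_embeddings_no_mask.npy are present under model_path.

# usage_sketch_axengine.py (hypothetical paths and coordinates)
import cv2
import numpy as np
from utils.EdgeTAM_image_predictor import ImagePredictor

predictor = ImagePredictor(model_path="./models")

# set_image expects an RGB HWC uint8 array
image = cv2.cvtColor(cv2.imread("example.jpg"), cv2.COLOR_BGR2RGB)
predictor.set_image(image)

# one foreground click at pixel (x=500, y=375)
point_coords = np.array([[500, 375]], dtype=np.float32)
point_labels = np.array([1], dtype=np.float32)

masks, iou_scores, low_res_logits = predictor.predict(
    point_coords=point_coords,
    point_labels=point_labels,
)
print(masks.shape, iou_scores)  # binary masks unless return_logits=True
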
utils/EdgeTAM_image_predictor_onnx.py
ADDED
@@ -0,0 +1,400 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import logging

from typing import List, Optional, Tuple, Union

import numpy as np
from PIL.Image import Image
from utils.transforms import SAM2Transforms, trunc_normal_
import onnxruntime as ort
# import axengine as ort
import cv2
import os


class ImagePredictor:
    def __init__(
        self,
        model_path,
        mask_threshold=0.0,
        max_hole_area=0.0,
        max_sprinkle_area=0.0,
        resolution=1024,
        **kwargs,
    ) -> None:
        """
        Uses SAM-2 to calculate the image embedding for an image, and then
        allows repeated, efficient mask prediction given prompts.

        Arguments:
          model_path (str): Directory containing the exported EdgeTAM ONNX files.
          mask_threshold (float): The threshold to use when converting mask logits
            to binary masks. Masks are thresholded at 0 by default.
          max_hole_area (int): If max_hole_area > 0, we fill small holes of area
            up to max_hole_area in low_res_masks.
          max_sprinkle_area (int): If max_sprinkle_area > 0, we remove small sprinkles
            of area up to max_sprinkle_area in low_res_masks.
        """
        super().__init__()

        print("Loading EdgeTAM ONNX models...")
        self.image_encoder = ort.InferenceSession(f"{model_path}/edgetam_image_encoder.onnx")
        self.prompt_encoder = ort.InferenceSession(f"{model_path}/edgetam_prompt_encoder.onnx")
        self.prompt_mask_encoder = ort.InferenceSession(f"{model_path}/edgetam_prompt_mask_encoder.onnx")
        self.mask_decoder = ort.InferenceSession(f"{model_path}/edgetam_mask_decoder.onnx")

        self.model_path = model_path

        self._transforms = SAM2Transforms(
            resolution=resolution,
            mask_threshold=mask_threshold,
            max_hole_area=max_hole_area,
            max_sprinkle_area=max_sprinkle_area,
            onnx=True,
        )
        # Predictor state
        self._is_image_set = False
        self._features = None
        self._orig_hw = None
        # Whether the predictor is set for a single image or a batch of images
        self._is_batch = False

        # Predictor config
        self.mask_threshold = mask_threshold
        self.num_feature_levels = 3
        self.no_mem_embed = np.zeros((1, 1, 256))
        trunc_normal_(self.no_mem_embed, std=0.02)

        # Spatial dims for backbone feature maps
        self._bb_feat_sizes = [
            (256, 256),
            (128, 128),
            (64, 64),
        ]

    def set_image(
        self,
        image: Union[np.ndarray, Image],
    ) -> None:
        """
        Calculates the image embeddings for the provided image, allowing
        masks to be predicted with the 'predict' method.

        Arguments:
          image (np.ndarray or PIL Image): The input image to embed, in RGB format
            with pixel values in [0, 255]. The image should be in HWC format if
            np.ndarray, or WHC format if PIL Image.
        """
        self.reset_predictor()
        # Transform the image to the form expected by the model
        if isinstance(image, np.ndarray):
            logging.info("For numpy array image, we assume (HxWxC) format")
            self._orig_hw = [image.shape[:2]]

        input_image = self._transforms(image).astype(np.float32)  # 3xHxW np.ndarray
        input_image = input_image[None, ...]
        # np.save(f"{self.model_path}/input_image.npy", input_image)

        assert (
            len(input_image.shape) == 4 and input_image.shape[1] == 3
        ), f"input_image must be of size 1x3xHxW, got {input_image.shape}"
        logging.info("Computing image embeddings for the provided image...")
        vision_feats = self.image_encoder.run(None, {"input_image": input_image.astype(np.float32)})

        feats = [
            np.transpose(feat[:, 0, :].reshape(H, W, feat.shape[-1]), (2, 0, 1))[np.newaxis, :]
            for feat, (H, W) in zip(reversed(vision_feats), reversed(self._bb_feat_sizes))
        ][::-1]

        self._features = {"image_embed": feats[-1], "high_res_feats": feats[:-1]}
        self._is_image_set = True
        logging.info("Image embeddings computed.")

    def predict(
        self,
        point_coords: Optional[np.ndarray] = None,
        point_labels: Optional[np.ndarray] = None,
        box: Optional[np.ndarray] = None,
        mask_input: Optional[np.ndarray] = None,
        multimask_output: bool = True,
        return_logits: bool = False,
        normalize_coords=True,
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Predict masks for the given input prompts, using the currently set image.

        Arguments:
          point_coords (np.ndarray or None): An Nx2 array of point prompts to the
            model. Each point is in (X, Y) in pixels.
          point_labels (np.ndarray or None): A length-N array of labels for the
            point prompts. 1 indicates a foreground point and 0 indicates a
            background point.
          box (np.ndarray or None): A length-4 array giving a box prompt to the
            model, in XYXY format.
          mask_input (np.ndarray): A low-resolution mask input to the model, typically
            coming from a previous prediction iteration. Has shape 1xHxW, where
            for SAM, H=W=256.
          multimask_output (bool): If true, the model will return three masks.
            For ambiguous input prompts (such as a single click), this will often
            produce better masks than a single prediction. If only a single
            mask is needed, the model's predicted quality score can be used
            to select the best mask. For non-ambiguous prompts, such as multiple
            input prompts, multimask_output=False can give better results.
          return_logits (bool): If true, returns un-thresholded mask logits
            instead of a binary mask.
          normalize_coords (bool): If true, the point coordinates will be normalized
            to the range [0, 1], and point_coords is expected to be given w.r.t.
            the image dimensions.

        Returns:
          (np.ndarray): The output masks in CxHxW format, where C is the
            number of masks, and (H, W) is the original image size.
          (np.ndarray): An array of length C containing the model's
            predictions for the quality of each mask.
          (np.ndarray): An array of shape CxHxW, where C is the number
            of masks and H=W=256. These low-resolution logits can be passed to
            a subsequent iteration as mask input.
        """
        if not self._is_image_set:
            raise RuntimeError(
                "An image must be set with .set_image(...) before mask prediction."
            )

        # Transform input prompts

        # Type checks / casts
        point_coords = point_coords.astype(np.float32) if point_coords is not None else None
        point_labels = point_labels.astype(np.float32) if point_labels is not None else None
        box = box.astype(np.float32) if box is not None else None
        mask_input = mask_input.astype(np.float32) if mask_input is not None else None

        mask_input, unnorm_coords, labels, unnorm_box = self._prep_prompts(
            point_coords, point_labels, box, mask_input, normalize_coords
        )

        masks, iou_predictions, low_res_masks = self._predict(
            unnorm_coords,
            labels,
            unnorm_box,
            mask_input,
            multimask_output,
            return_logits=return_logits,
        )

        masks_np = masks

        iou_predictions_np = iou_predictions[0]
        low_res_masks_np = low_res_masks[0]
        return masks_np, iou_predictions_np, low_res_masks_np

    def _prep_prompts(
        self, point_coords, point_labels, box, mask_logits, normalize_coords, img_idx=-1
    ):
        unnorm_coords, labels, unnorm_box, mask_input = None, None, None, None
        if point_coords is not None:
            assert (
                point_labels is not None
            ), "point_labels must be supplied if point_coords is supplied."
            unnorm_coords = self._transforms.transform_coords(
                point_coords, normalize=normalize_coords, orig_hw=self._orig_hw[img_idx]
            )

            if len(unnorm_coords.shape) == 2:
                unnorm_coords, labels = unnorm_coords[np.newaxis, ...], point_labels[np.newaxis, ...]
        if box is not None:
            unnorm_box = self._transforms.transform_boxes(
                box, normalize=normalize_coords, orig_hw=self._orig_hw[img_idx]
            )  # Bx2x2
        if mask_logits is not None:
            if len(mask_logits.shape) == 3:
                mask_logits = mask_logits[np.newaxis, :, :, :]

        return mask_logits, unnorm_coords, labels, unnorm_box

    def _predict(
        self,
        point_coords,
        point_labels,
        boxes=None,
        mask_input=None,
        multimask_output=True,
        return_logits=False,
        img_idx=-1,
    ):
        """
        Predict masks for the given input prompts, using the currently set image.
        Input prompts are batched np.ndarrays and are expected to already be
        transformed to the input frame using SAM2Transforms.

        Arguments:
          point_coords (np.ndarray or None): A BxNx2 array of point prompts to the
            model. Each point is in (X, Y) in pixels.
          point_labels (np.ndarray or None): A BxN array of labels for the
            point prompts. 1 indicates a foreground point and 0 indicates a
            background point.
          boxes (np.ndarray or None): A Bx4 array giving a box prompt to the
            model, in XYXY format.
          mask_input (np.ndarray): A low-resolution mask input to the model, typically
            coming from a previous prediction iteration. Has shape Bx1xHxW, where
            for SAM, H=W=256. Masks returned by a previous iteration of the
            predict method do not need further transformation.
          multimask_output (bool): If true, the model will return three masks.
            For ambiguous input prompts (such as a single click), this will often
            produce better masks than a single prediction. If only a single
            mask is needed, the model's predicted quality score can be used
            to select the best mask. For non-ambiguous prompts, such as multiple
            input prompts, multimask_output=False can give better results.
          return_logits (bool): If true, returns un-thresholded mask logits
            instead of a binary mask.

        Returns:
          (np.ndarray): The output masks in BxCxHxW format, where C is the
            number of masks, and (H, W) is the original image size.
          (np.ndarray): An array of shape BxC containing the model's
            predictions for the quality of each mask.
          (np.ndarray): An array of shape BxCxHxW, where C is the number
            of masks and H=W=256. These low-res logits can be passed to
            a subsequent iteration as mask input.
        """
        if not self._is_image_set:
            raise RuntimeError(
                "An image must be set with .set_image(...) before mask prediction."
            )

        if point_coords is not None:
            concat_points = (point_coords, point_labels)
        else:
            concat_points = None

        # Embed prompts
        if boxes is not None:
            box_coords = boxes.reshape(-1, 2, 2)
            box_labels = np.array([[2, 3]], dtype=np.float32)
            # tile the labels to one row per box (np equivalent of torch's repeat)
            box_labels = np.tile(box_labels, (boxes.shape[0], 1))
            # we merge "boxes" and "points" into a single "concat_points" input (where
            # boxes are added at the beginning) to sam_prompt_encoder
            if concat_points is not None:
                concat_coords = np.concatenate([box_coords, concat_points[0]], axis=1)
                concat_labels = np.concatenate([box_labels, concat_points[1]], axis=1)
                concat_points = (concat_coords, concat_labels)
            else:
                print("Only box input provided")
                concat_points = (box_coords, box_labels)

        # Pad/truncate the prompts to a fixed 4 points (the exported prompt
        # encoder expects a static input shape; at most 4 points are supported).
        input_coords = np.tile(concat_points[0], (4, 1))[:, :4, :]
        input_labels = np.tile(concat_points[1], (4))[:, :4]

        if mask_input is None or not mask_input.any():
            print("Get dense_embeddings_no_mask")
            sparse_embeddings = self.prompt_encoder.run(
                None,
                {
                    "point_coords": input_coords if concat_points is not None else np.array([]),
                    "point_labels": input_labels if concat_points is not None else np.array([])
                    # "boxes": boxes if boxes is not None else np.zeros((1, 4), dtype=np.float32)
                },
            )[0]
            # no mask prompt: use the precomputed "no mask" dense embedding
            # np.save(f"{self.model_path}/dense_embeddings_no_mask.npy", dense_embeddings)
            dense_embeddings = np.load(f"{self.model_path}/dense_embeddings_no_mask.npy")

            # save the padded prompts (debug dump)
            np.save(f"{self.model_path}/point_coords.npy", input_coords)
            np.save(f"{self.model_path}/point_labels.npy", input_labels)
        else:
            print("Get dense_embeddings_mask")
            sparse_embeddings = self.prompt_encoder.run(
                None,
                {
                    "point_coords": input_coords if concat_points is not None else np.array([]),
                    "point_labels": input_labels if concat_points is not None else np.array([])
                    # "boxes": boxes if boxes is not None else np.zeros((1, 4), dtype=np.float32)
                },
            )[0]
            dense_embeddings = self.prompt_mask_encoder.run(
                None,
                {
                    "input.1": mask_input
                },
            )[0]

        # Predict masks
        batched_mode = (
            concat_points is not None and concat_points[0].shape[0] > 1
        )  # multi-object prediction

        high_res_features = [
            feat_level[img_idx][np.newaxis, ...]
            for feat_level in self._features["high_res_feats"]
        ]

        low_res_masks, iou_predictions = self.mask_decoder.run(
            None,
            {
                "image_embeddings": self._features["image_embed"][img_idx][np.newaxis, ...],
                # "image_pe": image_pe,
                "sparse_prompt_embeddings": sparse_embeddings,
                "dense_prompt_embeddings": dense_embeddings,
                "high_res_feat_0": high_res_features[0],
                "high_res_feat_1": high_res_features[1],
                # "multimask_output": np.array([1 if multimask_output else 0], dtype=np.int32),
            },
        )

        # Upscale the masks to the original image resolution
        mask = low_res_masks[0].transpose(1, 2, 0)  # HxWxC
        resize_masks = cv2.resize(mask, (self._orig_hw[img_idx][1], self._orig_hw[img_idx][0]), interpolation=cv2.INTER_LINEAR)

        resize_masks = resize_masks[np.newaxis, ...]  # 1xHxWxC
        resize_masks = np.clip(resize_masks, -32.0, 32.0)

        if not return_logits:
            resize_masks = resize_masks > self.mask_threshold

        return resize_masks, iou_predictions, low_res_masks

    def get_image_embedding(self):
        """
        Returns the image embeddings for the currently set image, with
        shape 1xCxHxW, where C is the embedding dimension and (H, W) are
        the embedding spatial dimensions of SAM (typically C=256, H=W=64).
        """
        if not self._is_image_set:
            raise RuntimeError(
                "An image must be set with .set_image(...) to generate an embedding."
            )
        assert (
            self._features is not None
        ), "Features must exist if an image has been set."
        return self._features["image_embed"]

    def reset_predictor(self) -> None:
        """
        Resets the image embeddings and other state variables.
        """
        self._is_image_set = False
        self._features = None
        self._orig_hw = None
        self._is_batch = False

    def _prepare_backbone_features(self, backbone_out):
        """Prepare and flatten visual features."""
        backbone_out = backbone_out.copy()
        assert len(backbone_out["backbone_fpn"]) == len(backbone_out["vision_pos_enc"])
        assert len(backbone_out["backbone_fpn"]) >= self.num_feature_levels

        feature_maps = backbone_out["backbone_fpn"][-self.num_feature_levels:]
        vision_pos_embeds = backbone_out["vision_pos_enc"][-self.num_feature_levels:]

        feat_sizes = [(x.shape[-2], x.shape[-1]) for x in vision_pos_embeds]
        # flatten NxCxHxW to HWxNxC
        vision_feats = [x.reshape(x.shape[0], x.shape[1], -1).transpose(2, 0, 1) for x in feature_maps]

        vision_pos_embeds = [x.reshape(x.shape[0], x.shape[1], -1).transpose(2, 0, 1) for x in vision_pos_embeds]

        return backbone_out, vision_feats, vision_pos_embeds, feat_sizes
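
The ONNX variant above differs from the AXEngine one only in the runtime (onnxruntime vs. axengine), the model file extensions, the onnx=True preprocessing flag, and a few debug np.save dumps. As the docstrings note, the low-resolution logits from one call can be fed back as mask_input to refine the next call; a sketch, under the same hypothetical paths as before:

# refinement_sketch_onnx.py (hypothetical paths and coordinates)
import cv2
import numpy as np
from utils.EdgeTAM_image_predictor_onnx import ImagePredictor

predictor = ImagePredictor(model_path="./models")
image = cv2.cvtColor(cv2.imread("example.jpg"), cv2.COLOR_BGR2RGB)
predictor.set_image(image)

# first pass: a single foreground click
coords = np.array([[500, 375]], dtype=np.float32)
labels = np.array([1], dtype=np.float32)
masks, scores, low_res_logits = predictor.predict(coords, labels)

# second pass: add a background click plus the best low-res logits (1xHxW)
best = low_res_logits[int(np.argmax(scores))][None, ...]
coords2 = np.array([[500, 375], [300, 100]], dtype=np.float32)
labels2 = np.array([1, 0], dtype=np.float32)
masks2, scores2, _ = predictor.predict(
    point_coords=coords2,
    point_labels=labels2,
    mask_input=best,
)
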
utils/__pycache__/EdgeTAM_image_predictor.cpython-311.pyc
ADDED
Binary file (20.9 kB).

utils/__pycache__/transforms.cpython-311.pyc
ADDED
Binary file (7.15 kB).

utils/transforms.py
ADDED
@@ -0,0 +1,139 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import warnings

import albumentations as A
import numpy as np
from scipy.stats import truncnorm
import cv2


class SAM2Transforms:
    def __init__(
        self, resolution, mask_threshold, max_hole_area=0.0, max_sprinkle_area=0.0, onnx=False
    ):
        """
        Transforms for SAM2.
        """
        super().__init__()
        self.resolution = resolution
        self.mask_threshold = mask_threshold
        self.max_hole_area = max_hole_area
        self.max_sprinkle_area = max_sprinkle_area
        self.transforms = A.Compose([
            A.Resize(height=resolution, width=resolution),  # resize first
            A.Normalize(mean=[0.485, 0.456, 0.406],  # ImageNet RGB mean
                        std=[0.229, 0.224, 0.225],   # ImageNet RGB std
                        max_pixel_value=255.0,       # input is uint8 in [0, 255]
                        p=1.0)
        ])
        self.onnx = onnx

    def __call__(self, x):
        # x: np.ndarray, HWC, uint8, RGB
        if self.onnx:
            # ONNX path: resize + ImageNet normalization
            x_normal = self.transforms(image=x)['image']
            return x_normal.transpose(2, 0, 1)
        else:
            # non-ONNX path: resize only (normalization is presumably
            # folded into the compiled model)
            x_normal = cv2.resize(x, (self.resolution, self.resolution), interpolation=cv2.INTER_LINEAR)
            return x_normal.transpose(2, 0, 1)

    def forward_batch(self, img_list):
        # img_list: list of np.ndarray, HWC, uint8, RGB
        img_batch = [self.transforms(image=img)['image'] for img in img_list]
        img_batch = np.concatenate([img[np.newaxis, :].transpose(0, 3, 1, 2) for img in img_batch], axis=0)
        return img_batch

    def transform_coords(
        self, coords, normalize=False, orig_hw=None
    ):
        """
        Expects an array with length 2 in the last dimension. The coordinates can be
        in absolute image or normalized coordinates. If the coords are in absolute
        image coordinates, normalize should be set to True and the original image
        size is required.

        Returns:
          Coordinates scaled to the model input resolution, which is what the SAM2
          model expects.
        """
        if normalize:
            assert orig_hw is not None
            h, w = orig_hw
            coords = coords.copy()
            coords[..., 0] = coords[..., 0] / w
            coords[..., 1] = coords[..., 1] / h
        coords = coords * self.resolution
        return coords

    def transform_boxes(
        self, boxes, normalize=False, orig_hw=None
    ):
        """
        Expects an array of shape Bx4. The coordinates can be in absolute image or
        normalized coordinates; if the coords are in absolute image coordinates,
        normalize should be set to True and the original image size is required.
        """
        boxes = self.transform_coords(boxes.reshape(-1, 2, 2), normalize, orig_hw)
        return boxes

    # The original torch-based post-processing, kept disabled as a string:
    """
    def postprocess_masks(self, masks, orig_hw):
        # Perform PostProcessing on output masks.
        from sam2.utils.misc import get_connected_components

        masks = masks.float()
        input_masks = masks
        mask_flat = masks.flatten(0, 1).unsqueeze(1)  # flatten as 1-channel image
        try:
            if self.max_hole_area > 0:
                # Holes are those connected components in background with area <= self.fill_hole_area
                # (background regions are those with mask scores <= self.mask_threshold)
                labels, areas = get_connected_components(
                    mask_flat <= self.mask_threshold
                )
                is_hole = (labels > 0) & (areas <= self.max_hole_area)
                is_hole = is_hole.reshape_as(masks)
                # We fill holes with a small positive mask score (10.0) to change them to foreground.
                masks = torch.where(is_hole, self.mask_threshold + 10.0, masks)

            if self.max_sprinkle_area > 0:
                labels, areas = get_connected_components(
                    mask_flat > self.mask_threshold
                )
                is_hole = (labels > 0) & (areas <= self.max_sprinkle_area)
                is_hole = is_hole.reshape_as(masks)
                # We fill holes with negative mask score (-10.0) to change them to background.
                masks = torch.where(is_hole, self.mask_threshold - 10.0, masks)
        except Exception as e:
            # Skip the post-processing step if the CUDA kernel fails
            warnings.warn(
                f"{e}\n\nSkipping the post-processing step due to the error above. You can "
                "still use SAM 2 and it's OK to ignore the error above, although some post-processing "
                "functionality may be limited (which doesn't affect the results in most cases; see "
                "https://github.com/facebookresearch/sam2/blob/main/INSTALL.md).",
                category=UserWarning,
                stacklevel=2,
            )
            masks = input_masks

        masks = F.interpolate(masks, orig_hw, mode="bilinear", align_corners=False)
        return masks
    """


def trunc_normal_(arr, std=0.02, mean=0.0):
    """
    In-place initialization of a numpy array from a truncated normal distribution.

    Truncation range: [mean - 2*std, mean + 2*std]
    """
    # truncation bounds in units of the standard deviation
    a = (mean - 2 * std - mean) / std  # = -2
    b = (mean + 2 * std - mean) / std  # = +2

    # draw truncated-normal samples
    samples = truncnorm.rvs(a, b, loc=mean, scale=std, size=arr.shape)

    # assign in place
    arr[:] = samples
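
A short sanity-check sketch for the helpers above: it pushes a dummy image through SAM2Transforms in ONNX mode and checks the coordinate scaling of transform_coords and the truncation range of trunc_normal_.

# transforms_sanity_check.py
import numpy as np
from utils.transforms import SAM2Transforms, trunc_normal_

tf = SAM2Transforms(resolution=1024, mask_threshold=0.0, onnx=True)

img = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)  # HWC, RGB
chw = tf(img)
assert chw.shape == (3, 1024, 1024)  # resized, normalized, transposed to CHW

# the bottom-right image corner (x=640, y=480) maps to the model-space corner
pt = tf.transform_coords(np.array([[640.0, 480.0]]), normalize=True, orig_hw=(480, 640))
assert np.allclose(pt, [[1024.0, 1024.0]])

buf = np.zeros((1, 1, 256))
trunc_normal_(buf, std=0.02)
assert np.abs(buf).max() <= 2 * 0.02  # samples truncated to +/- 2 std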