Commit 640f5b4 · s194649 committed
1 Parent(s): ae1d3bb

encoder decoder setup

Files changed:
- app.py (+9 -7)
- inference.py (+70 -6)
app.py CHANGED

```diff
@@ -10,6 +10,7 @@ from utils import generate_PCL, PCL3, point_cloud
 
 
 sam = SegmentPredictor()
+sam_cpu = SegmentPredictor(device='cpu')
 dpt = DepthPredictor()
 red = (255,0,0)
 blue = (0,0,255)
@@ -30,6 +31,7 @@ with block:
     cutout_idx = gr.State(set())
     pred_masks = gr.State([])
     prompt_masks = gr.State([])
+    embedding = gr.State()
 
     # UI
     with gr.Column():
@@ -73,7 +75,7 @@ with block:
         sam_decode_btn = gr.Button('Predict using points!', variant = 'primary')
 
     # components
-    components = {point_coords, point_labels, image_edit_trigger, masks, cutout_idx, input_image,
+    components = {point_coords, point_labels, image_edit_trigger, masks, cutout_idx, input_image, embedding,
                   point_label_radio, text, reset_btn, sam_sgmt_everything_btn,
                   sam_decode_btn, depth_reconstruction_btn, prompt_image, lbl_image, n_samples, max_depth, min_depth, cube_size, selected_masks_image}
 
@@ -88,7 +90,7 @@ with block:
         return input_image, point_coords_empty(), point_labels_empty(), None, []
     reset_btn.click(on_reset_btn_click, [input_image], [input_image, point_coords, point_labels], queue=False)
 
-    def on_prompt_image_select(input_image, prompt_image, point_coords, point_labels, point_label_radio, text, pred_masks, evt: gr.SelectData):
+    def on_prompt_image_select(input_image, prompt_image, point_coords, point_labels, point_label_radio, text, pred_masks, embedding, evt: gr.SelectData):
         x, y = evt.index
         color = red if point_label_radio == 0 else blue
         if prompt_image is None:
@@ -97,7 +99,7 @@ with block:
         cv2.circle(prompt_image, (x, y), 5, color, -1)
         point_coords.append([x,y])
         point_labels.append(point_label_radio)
-        sam_masks =
+        sam_masks = sam_cpu.cond_pred(pts=np.array(point_coords), lbls=np.array(point_labels), embedding=embedding)
         return [ prompt_image,
                  (input_image, sam_masks),
                  point_coords,
@@ -105,7 +107,7 @@ with block:
                  sam_masks ]
 
     prompt_image.select(on_prompt_image_select,
-                        [input_image, prompt_image, point_coords, point_labels, point_label_radio, text, pred_masks],
+                        [input_image, prompt_image, point_coords, point_labels, point_label_radio, text, pred_masks, embedding],
                         [prompt_image, lbl_image, point_coords, point_labels, pred_masks], queue=False)
 
 
@@ -139,10 +141,10 @@
     def on_click_sam_encode_btn(inputs):
        print("encoding")
        # encode image on click
-       sam.encode(inputs[input_image])
+       embedding = sam.encode(inputs[input_image]).cpu()
        print("encoding done")
-       return
+       return [inputs[input_image], embedding]
-   sam_encode_btn.click(on_click_sam_encode_btn, components, [prompt_image], queue=False)
+   sam_encode_btn.click(on_click_sam_encode_btn, components, [prompt_image, embedding], queue=False)
 
     def on_click_sam_dencode_btn(inputs):
        print("inferencing")
```
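The new wiring encodes the uploaded image once on the GPU predictor (`sam`) and then serves every point-prompt click from the CPU predictor (`sam_cpu`), threading the cached embedding through the `embedding` `gr.State`. A minimal sketch of the same encode-once / decode-per-click flow outside the Gradio UI, assuming the `SegmentPredictor` API from `inference.py` below (the image array and click coordinates are stand-ins):

```python
import numpy as np
from inference import SegmentPredictor

sam = SegmentPredictor()                   # picks CUDA when available; runs the heavy image encoder
sam_cpu = SegmentPredictor(device='cpu')   # only ever runs the lightweight mask decoder

image = np.zeros((512, 512, 3), dtype=np.uint8)  # stand-in for the uploaded image
embedding = sam.encode(image).cpu()              # encode once, move the embedding off the GPU

# Each click appends one point and re-runs only the cheap decoding step:
point_coords = [[256, 256]]
point_labels = [1]  # 1 = foreground point, 0 = background point
masks = sam_cpu.cond_pred(pts=np.array(point_coords),
                          lbls=np.array(point_labels),
                          embedding=embedding)
```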
inference.py CHANGED

```diff
@@ -11,6 +11,10 @@ import pandas as pd
 import plotly.express as px
 import matplotlib.pyplot as plt
 
+
+
+
+
 def map_image_range(image, min_value, max_value):
     """
     Maps the values of a numpy image array to a specified range.
@@ -188,26 +192,86 @@ class DepthPredictor:
 
 
 
+import numpy as np
+from typing import Optional, Tuple
+
+class CustomSamPredictor(SamPredictor):
+    def __init__(
+        self,
+        sam_model,
+    ) -> None:
+        super().__init__(sam_model)
+
+    def encode_image(self, image: np.ndarray, image_format: str = "RGB") -> torch.Tensor:
+        """
+        Encodes the image and returns its embedding.
+
+        Arguments:
+          image (np.ndarray): The image for which to calculate the embedding.
+          image_format (str): The color format of the image, in ['RGB', 'BGR'].
+
+        Returns:
+          torch.Tensor: The image embedding with shape 1xCxHxW.
+        """
+        self.set_image(image, image_format)
+        return self.get_image_embedding()
+
+    def decode_and_predict(
+        self,
+        embedding: torch.Tensor,
+        point_coords: Optional[np.ndarray] = None,
+        point_labels: Optional[np.ndarray] = None,
+        box: Optional[np.ndarray] = None,
+        mask_input: Optional[np.ndarray] = None,
+        multimask_output: bool = True,
+        return_logits: bool = False,
+    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+        """
+        Decodes the provided image embedding and makes mask predictions based on prompts.
+
+        Arguments:
+          embedding (torch.Tensor): The image embedding to decode.
+          ... (other arguments from the predict function)
+
+        Returns:
+          (np.ndarray): The output masks in CxHxW format.
+          (np.ndarray): An array of quality predictions for each mask.
+          (np.ndarray): Low resolution mask logits for subsequent iterations.
+        """
+        self.set_torch_image(embedding, (embedding.shape[-2], embedding.shape[-1]))
+        return self.predict(
+            point_coords=point_coords,
+            point_labels=point_labels,
+            box=box,
+            mask_input=mask_input,
+            multimask_output=multimask_output,
+            return_logits=return_logits,
+        )
+
 
 class SegmentPredictor:
-    def __init__(self):
+    def __init__(self, device=None):
         MODEL_TYPE = "vit_h"
         checkpoint = "sam_vit_h_4b8939.pth"
         sam = sam_model_registry[MODEL_TYPE](checkpoint=checkpoint)
         # Select device
-
+        if device is None:
+            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        else:
+            self.device = device
         sam.to(device=self.device)
         self.mask_generator = SamAutomaticMaskGenerator(sam)
-        self.conditioned_pred =
+        self.conditioned_pred = CustomSamPredictor(sam)
 
     def encode(self, image):
         image = np.array(image)
-        self.
+        return self.encode_image(image)
 
-    def cond_pred(self, pts, lbls):
+    def cond_pred(self, embedding, pts, lbls):
         lbls = np.array(lbls)
         pts = np.array(pts)
-        masks, _, _ = self.conditioned_pred.
+        masks, _, _ = self.conditioned_pred.decode_and_predict(
+            embedding,
         point_coords=pts,
         point_labels=lbls,
         multimask_output=True
```
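`CustomSamPredictor` splits the stock `SamPredictor.predict` flow into an explicit encode step (`encode_image`, which wraps `set_image` and `get_image_embedding`) and a decode step that re-injects a stored embedding. A hypothetical round trip, assuming the `sam_vit_h_4b8939.pth` checkpoint from the diff is on disk and the image is a stand-in:

```python
import numpy as np
import torch
from segment_anything import sam_model_registry
from inference import CustomSamPredictor

device = "cuda" if torch.cuda.is_available() else "cpu"
sam_model = sam_model_registry["vit_h"](checkpoint="sam_vit_h_4b8939.pth").to(device)
predictor = CustomSamPredictor(sam_model)

image = np.zeros((512, 512, 3), dtype=np.uint8)  # placeholder RGB image
embedding = predictor.encode_image(image)        # 1xCxHxW features via set_image()

masks, scores, logits = predictor.decode_and_predict(
    embedding,
    point_coords=np.array([[256, 256]]),
    point_labels=np.array([1]),  # 1 = foreground
    multimask_output=True,
)
```

One caveat: `decode_and_predict` feeds the embedding back through `set_torch_image`, which in upstream segment-anything runs the image encoder on its input and derives mask sizes from the original image size, so whether this shortcut reproduces `predict`'s behavior depends on the segment-anything version in use.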