CodeJackR committed on
Commit
e0fb0e6
·
1 Parent(s): 064d94e

Input image as image

Browse files
Files changed (1) hide show
  1. handler.py +38 -52
handler.py CHANGED
@@ -24,111 +24,97 @@ class EndpointHandler():
24
  self.processor = SamProcessor.from_pretrained(path)
25
  except Exception as e:
26
  # Fallback to loading from a known SAM model if local loading fails
27
- print(f"Failed to load from local path: {e}")
28
  print("Attempting to load from facebook/sam-vit-base")
29
  self.model = SamModel.from_pretrained("facebook/sam-vit-base").to(device)
30
  self.processor = SamProcessor.from_pretrained("facebook/sam-vit-base")
31
 
32
- def __call__(self, data: Any) -> Any:
33
  """
34
  Called on every HTTP request.
35
- Args:
36
- data (:obj:):
37
- includes the input data and the parameters for the inference.
38
  """
39
- inputs = data.pop("inputs", data)
40
- parameters = data.pop("parameters", {})
 
 
41
 
42
- img = Image.open(io.BytesIO(inputs))
43
-
44
- # img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
45
- # img = raw_images[0]
46
 
47
- # SAM requires input prompts, so we'll generate a center point prompt
48
- height, width = img.size[1], img.size[0] # PIL returns (width, height)
49
 
50
- # Create a center point prompt for automatic segmentation
 
51
  input_points = [[[width // 2, height // 2]]] # Center point
52
  input_labels = [[1]] # Positive prompt
53
 
54
- # Prepare inputs for the model with prompts
55
- inputs = self.processor(img, input_points=input_points, input_labels=input_labels, return_tensors="pt")
56
 
57
- # Generate masks using the model
58
  with torch.no_grad():
59
  outputs = self.model(**inputs)
60
 
 
61
  try:
62
- # Get original image size
63
  original_height, original_width = inputs["original_sizes"][0].tolist()
 
 
64
 
65
- # Get predicted masks and scores
66
- pred_masks = outputs.pred_masks.cpu() # (batch, num_masks, H, W)
67
- iou_scores = outputs.iou_scores.cpu()[0] # (num_masks,)
68
-
69
- # The model might return 4D or 5D tensors. Squeeze if 5D.
70
  if pred_masks.ndim == 5:
71
  pred_masks = pred_masks.squeeze(1)
72
 
73
- # Select the best mask
74
  best_mask_idx = torch.argmax(iou_scores)
75
- best_mask_tensor = pred_masks[0, best_mask_idx, :, :] # (H, W)
76
 
77
- # Upscale the mask to original image size
78
- # Add batch and channel dims for interpolate
79
  upscaled_mask = F.interpolate(
80
  best_mask_tensor.unsqueeze(0).unsqueeze(0).float(),
81
  size=(original_height, original_width),
82
  mode='bilinear',
83
  align_corners=False
84
- ).squeeze() # remove batch/channel dims
85
 
86
- # Convert to binary mask
87
  mask_binary = (upscaled_mask > 0.0).numpy().astype(np.uint8) * 255
88
 
89
  except Exception as e:
90
- print(f"Error processing masks: {e}")
91
- # Fallback
92
- height, width = img.size[1], img.size[0]
93
  mask_binary = np.zeros((height, width), dtype=np.uint8)
94
  center_x, center_y = width // 2, height // 2
95
  size = min(width, height) // 8
96
  mask_binary[center_y-size:center_y+size, center_x-size:center_x+size] = 255
97
 
98
- # Convert result to base64
99
  out = io.BytesIO()
100
  Image.fromarray(mask_binary).save(out, format="PNG")
101
  out.seek(0)
102
  mask_base64 = base64.b64encode(out.getvalue()).decode('utf-8')
103
-
104
- # Decode the returned mask and save
105
- mask_bytes = base64.b64decode(mask_base64)
106
- mask_img = Image.open(io.BytesIO(mask_bytes)).convert("RGB")
107
- # mask_img.save(output_path, format="JPEG")
108
- # print(f"Wrote mask to {output_path}")
109
 
110
- # Return in the expected format
111
- return mask_img
112
 
113
def main(input_path="/Users/rp7/Downloads/test.jpeg", output_path="output.jpg"):
    """Exercise the handler locally: read an image, segment it, save the mask.

    Args:
        input_path: path of the image to segment (default kept for
            backward compatibility with the original hardcoded path).
        output_path: where the resulting mask image is written (JPEG).
    """
    # Read and base64-encode the input image as a data URL, mirroring
    # what an HTTP client would send to the endpoint.
    with open(input_path, "rb") as f:
        img_bytes = f.read()
    img_b64 = base64.b64encode(img_bytes).decode("utf-8")
    data_url = f"data:image/jpeg;base64,{img_b64}"

    handler = EndpointHandler(path=".")
    result = handler({"inputs": data_url})

    # The handler may return either a list of dicts with a base64 PNG mask
    # or a PIL image directly; handle both shapes defensively.
    if isinstance(result, list):
        mask_bytes = base64.b64decode(result[0]["mask_png_base64"])
        mask_img = Image.open(io.BytesIO(mask_bytes)).convert("RGB")
    else:
        mask_img = result.convert("RGB")

    mask_img.save(output_path, format="JPEG")
    print(f"Wrote mask to {output_path}")


if __name__ == "__main__":
    main()
 
24
  self.processor = SamProcessor.from_pretrained(path)
25
  except Exception as e:
26
  # Fallback to loading from a known SAM model if local loading fails
27
+ print("Failed to load from local path: {}".format(e))
28
  print("Attempting to load from facebook/sam-vit-base")
29
  self.model = SamModel.from_pretrained("facebook/sam-vit-base").to(device)
30
  self.processor = SamProcessor.from_pretrained("facebook/sam-vit-base")
31
 
32
+ def __call__(self, data):
33
  """
34
  Called on every HTTP request.
35
+ Expecting base64 encoded image in the 'inputs' field.
 
 
36
  """
37
+ # 1. Parse and decode the input image
38
+ image_data = data.pop("inputs", None)
39
+ if not image_data:
40
+ raise ValueError("Missing 'inputs' key with a base64 image string.")
41
 
42
+ if isinstance(image_data, str) and image_data.startswith("data:"):
43
+ image_data = image_data.split(",", 1)[1]
 
 
44
 
45
+ image_bytes = base64.b64decode(image_data)
46
+ img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
47
 
48
+ # 2. Prepare prompts and process the image
49
+ height, width = img.size[1], img.size[0]
50
  input_points = [[[width // 2, height // 2]]] # Center point
51
  input_labels = [[1]] # Positive prompt
52
 
53
+ inputs = self.processor(img, input_points=input_points, input_labels=input_labels, return_tensors="pt").to(device)
 
54
 
55
+ # 3. Generate masks
56
  with torch.no_grad():
57
  outputs = self.model(**inputs)
58
 
59
+ # 4. Process and select the best mask
60
  try:
 
61
  original_height, original_width = inputs["original_sizes"][0].tolist()
62
+ pred_masks = outputs.pred_masks.cpu()
63
+ iou_scores = outputs.iou_scores.cpu()[0]
64
 
 
 
 
 
 
65
  if pred_masks.ndim == 5:
66
  pred_masks = pred_masks.squeeze(1)
67
 
 
68
  best_mask_idx = torch.argmax(iou_scores)
69
+ best_mask_tensor = pred_masks[0, best_mask_idx, :, :]
70
 
 
 
71
  upscaled_mask = F.interpolate(
72
  best_mask_tensor.unsqueeze(0).unsqueeze(0).float(),
73
  size=(original_height, original_width),
74
  mode='bilinear',
75
  align_corners=False
76
+ ).squeeze()
77
 
 
78
  mask_binary = (upscaled_mask > 0.0).numpy().astype(np.uint8) * 255
79
 
80
  except Exception as e:
81
+ print("Error processing masks: {}".format(e))
 
 
82
  mask_binary = np.zeros((height, width), dtype=np.uint8)
83
  center_x, center_y = width // 2, height // 2
84
  size = min(width, height) // 8
85
  mask_binary[center_y-size:center_y+size, center_x-size:center_x+size] = 255
86
 
87
+ # 5. Encode the output image to base64
88
  out = io.BytesIO()
89
  Image.fromarray(mask_binary).save(out, format="PNG")
90
  out.seek(0)
91
  mask_base64 = base64.b64encode(out.getvalue()).decode('utf-8')
 
 
 
 
 
 
92
 
93
+ # 6. Return the response
94
+ return [{"mask_png_base64": mask_base64}]
95
 
96
def main(input_path="/Users/rp7/Downloads/test.jpeg", output_path="output.png"):
    """Show how a client would call the endpoint, end to end.

    Args:
        input_path: path of the image to segment (default kept for
            backward compatibility with the original hardcoded path).
        output_path: where the resulting mask PNG is written.
    """
    # 1. Prepare the payload exactly as an HTTP client would: a data URL
    # wrapping the base64-encoded image bytes.
    with open(input_path, "rb") as f:
        img_bytes = f.read()
    img_b64 = base64.b64encode(img_bytes).decode("utf-8")
    payload = {"inputs": "data:image/jpeg;base64,{}".format(img_b64)}

    # 2. Instantiate handler and call it
    handler = EndpointHandler(path=".")
    result = handler(payload)

    # 3. Process the response: the handler returns a list of dicts holding
    # a base64 PNG mask; decode and write it out.
    mask_b64 = result[0]["mask_png_base64"]
    mask_bytes = base64.b64decode(mask_b64)

    mask_img = Image.open(io.BytesIO(mask_bytes))
    mask_img.save(output_path)
    print("Wrote mask to {}".format(output_path))


if __name__ == "__main__":
    main()