Upload folder using huggingface_hub
Browse files- .gitattributes +12 -0
- README.md +46 -3
- ax_model/mobile_sam_decoder_620E.axmodel +3 -0
- ax_model/mobile_sam_decoder_650.axmodel +3 -0
- ax_model/mobile_sam_encoder_620E.axmodel +3 -0
- ax_model/mobile_sam_encoder_650.axmodel +3 -0
- config.json +0 -0
- images/box_mask_box_0.jpg +0 -0
- images/box_mask_box_1.jpg +0 -0
- images/box_mask_box_2.jpg +0 -0
- images/box_mask_box_3.jpg +0 -0
- images/box_mask_ovlap_box_0.jpg +3 -0
- images/box_mask_ovlap_box_1.jpg +3 -0
- images/box_mask_ovlap_box_2.jpg +3 -0
- images/box_mask_ovlap_box_3.jpg +3 -0
- images/car.jpg +3 -0
- images/point_mask_ovlap_point_0.jpg +3 -0
- images/point_mask_ovlap_point_1.jpg +3 -0
- images/point_mask_ovlap_point_2.jpg +3 -0
- images/point_mask_point_0.jpg +0 -0
- images/point_mask_point_1.jpg +0 -0
- images/point_mask_point_2.jpg +0 -0
- images/test.jpg +3 -0
- images/truck.jpg +3 -0
- onnx/mobile_sam_decoder_slim.onnx +3 -0
- onnx/mobile_sam_encoder.onnx +3 -0
- python_ax/__pycache__/sam_decoder.cpython-312.pyc +0 -0
- python_ax/__pycache__/sam_decoder.cpython-313.pyc +0 -0
- python_ax/__pycache__/sam_encoder.cpython-312.pyc +0 -0
- python_ax/__pycache__/sam_encoder.cpython-313.pyc +0 -0
- python_ax/main.py +99 -0
- python_ax/sam_decoder.py +40 -0
- python_ax/sam_encoder.py +67 -0
- python_onnx/__pycache__/sam_decoder.cpython-312.pyc +0 -0
- python_onnx/__pycache__/sam_decoder.cpython-313.pyc +0 -0
- python_onnx/__pycache__/sam_encoder.cpython-312.pyc +0 -0
- python_onnx/__pycache__/sam_encoder.cpython-313.pyc +0 -0
- python_onnx/main.py +70 -0
- python_onnx/sam_decoder.py +38 -0
- python_onnx/sam_encoder.py +62 -0
- requirements.txt +2 -0
.gitattributes
CHANGED
|
@@ -35,3 +35,15 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
ax_model/mobile_sam_decoder_620E.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 37 |
ax_model/mobile_sam_decoder_650.axmodel filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
ax_model/mobile_sam_decoder_620E.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 37 |
ax_model/mobile_sam_decoder_650.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
ax_model/mobile_sam_encoder_620E.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
ax_model/mobile_sam_encoder_650.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
images/box_mask_ovlap_box_0.jpg filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
images/box_mask_ovlap_box_1.jpg filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
images/box_mask_ovlap_box_2.jpg filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
images/box_mask_ovlap_box_3.jpg filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
images/car.jpg filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
images/point_mask_ovlap_point_0.jpg filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
images/point_mask_ovlap_point_1.jpg filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
images/point_mask_ovlap_point_2.jpg filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
images/test.jpg filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
images/truck.jpg filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,3 +1,46 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# MobileSAM
|
| 2 |
+
基于MobileSAM的图像分割Pipeline,支持多种输入提示(框、点、掩码),支持650N/620E系列平台的模型推理。
|
| 3 |
+
|
| 4 |
+
支持芯片:
|
| 5 |
+
- AX650N
|
| 6 |
+
- AX620E
|
| 7 |
+
- AX630C
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
支持硬件
|
| 11 |
+
|
| 12 |
+
- [M4N-Dock(爱芯派Pro)](https://wiki.sipeed.com/hardware/zh/maixIV/m4ndock/m4ndock.html)
|
| 13 |
+
- [M.2 Accelerator card](https://docs.m5stack.com/zh_CN/ai_hardware/LLM-8850_Card)
|
| 14 |
+
|
| 15 |
+
原始模型请参考
|
| 16 |
+
- [MobileSAM Github](https://github.com/ChaoningZhang/MobileSAM)
|
| 17 |
+
|
| 18 |
+
## 性能对比
|
| 19 |
+
|
| 20 |
+
- 输入图片大小 1024x1024
|
| 21 |
+
|
| 22 |
+
|Chip| Models | Latency (ms) | CMM Usage (MiB) |
|
| 23 |
+
|----| --------------------- | ---------------------- | -------------- |
|
| 24 |
+
|650N| mobile_sam_encoder |49.495 | 48.334 |
|
| 25 |
+
|620E| mobile_sam_encoder |520.044 | 63.231 |
|
| 26 |
+
|650N| mobile_sam_decoder |9.930 | 16.703 |
|
| 27 |
+
|620E| mobile_sam_decoder |36.382 | 14.970 |
|
| 28 |
+
|
| 29 |
+
## 模型转换
|
| 30 |
+
- 模型转换工具链[Pulsar2](https://huggingface.co/AXERA-TECH/Pulsar2)
|
| 31 |
+
- 转换文档[Model Convert](https://github.com/AXERA-TECH/MobileSAM.axera/tree/master/convert)
|
| 32 |
+
|
| 33 |
+
## 环境准备
|
| 34 |
+
- NPU Python API: [pyaxengine](https://github.com/AXERA-TECH/pyaxengine)
|
| 35 |
+
|
| 36 |
+
安装需要的python库
|
| 37 |
+
```pip install -r requirements.txt```
|
| 38 |
+
|
| 39 |
+
## 运行
|
| 40 |
+
```shell
|
| 41 |
+
cd python_ax
|
| 42 |
+
python3 main.py -i ../images/test.jpg -c 650
|
| 43 |
+
```
|
| 44 |
+
output:
|
| 45 |
+

|
| 46 |
+

|
ax_model/mobile_sam_decoder_620E.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8f968ca9f3cb313de45cc8519b62dcae2c834613f58abce85a14f8d7cd8cc7fd
|
| 3 |
+
size 7049493
|
ax_model/mobile_sam_decoder_650.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bfa87a19cfa3a95518d789cc1e5684b1d8d3c8d19b4e1e8155de5b5f5627020e
|
| 3 |
+
size 7031379
|
ax_model/mobile_sam_encoder_620E.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e941235ea497f63d91176cb6dfe856e52ccfcf6ec26157d75bcbd43aa31af851
|
| 3 |
+
size 25503415
|
ax_model/mobile_sam_encoder_650.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:88ad4b13478d7adfb6db44661933e5273207dfb15c1a74588fda860944501f9f
|
| 3 |
+
size 16467039
|
config.json
ADDED
|
File without changes
|
images/box_mask_box_0.jpg
ADDED
|
images/box_mask_box_1.jpg
ADDED
|
images/box_mask_box_2.jpg
ADDED
|
images/box_mask_box_3.jpg
ADDED
|
images/box_mask_ovlap_box_0.jpg
ADDED
|
Git LFS Details
|
images/box_mask_ovlap_box_1.jpg
ADDED
|
Git LFS Details
|
images/box_mask_ovlap_box_2.jpg
ADDED
|
Git LFS Details
|
images/box_mask_ovlap_box_3.jpg
ADDED
|
Git LFS Details
|
images/car.jpg
ADDED
|
Git LFS Details
|
images/point_mask_ovlap_point_0.jpg
ADDED
|
Git LFS Details
|
images/point_mask_ovlap_point_1.jpg
ADDED
|
Git LFS Details
|
images/point_mask_ovlap_point_2.jpg
ADDED
|
Git LFS Details
|
images/point_mask_point_0.jpg
ADDED
|
images/point_mask_point_1.jpg
ADDED
|
images/point_mask_point_2.jpg
ADDED
|
images/test.jpg
ADDED
|
Git LFS Details
|
images/truck.jpg
ADDED
|
Git LFS Details
|
onnx/mobile_sam_decoder_slim.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c7587d54e1df69e90573ad8b5672686e7ae8a295761fd3fd5d7ea45710ff92bb
|
| 3 |
+
size 20593937
|
onnx/mobile_sam_encoder.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8e35f2d687beafe0e327d4197ba015bca22a73142c65899572278d04fbecb149
|
| 3 |
+
size 27996273
|
python_ax/__pycache__/sam_decoder.cpython-312.pyc
ADDED
|
Binary file (3.37 kB). View file
|
|
|
python_ax/__pycache__/sam_decoder.cpython-313.pyc
ADDED
|
Binary file (3.49 kB). View file
|
|
|
python_ax/__pycache__/sam_encoder.cpython-312.pyc
ADDED
|
Binary file (3.32 kB). View file
|
|
|
python_ax/__pycache__/sam_encoder.cpython-313.pyc
ADDED
|
Binary file (3.27 kB). View file
|
|
|
python_ax/main.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sam_encoder import SAMEncoder
|
| 2 |
+
from sam_decoder import SAMDecoder
|
| 3 |
+
import cv2
|
| 4 |
+
import numpy as np
|
| 5 |
+
import argparse
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
if __name__ == "__main__":
|
| 9 |
+
|
| 10 |
+
parser = argparse.ArgumentParser()
|
| 11 |
+
parser.add_argument("--img_path", "-i", type=str, default="../images/test.jpg", help="input image path")
|
| 12 |
+
parser.add_argument("--output_dir", "-o", type=str, default="./output", help="result path")
|
| 13 |
+
parser.add_argument("--chip", "-c", type=str, default="650", help="650 or 620E")
|
| 14 |
+
args = parser.parse_args()
|
| 15 |
+
|
| 16 |
+
encoder = SAMEncoder(f"../ax_model/mobile_sam_encoder_{args.chip}.axmodel")
|
| 17 |
+
decoder = SAMDecoder(f"../ax_model/mobile_sam_decoder_{args.chip}.axmodel")
|
| 18 |
+
|
| 19 |
+
image = cv2.imread(args.img_path)
|
| 20 |
+
h, w, _ = image.shape
|
| 21 |
+
image_embedding, scale = encoder.encode(image)
|
| 22 |
+
|
| 23 |
+
# test.jpg
|
| 24 |
+
point0 = (910, 641)
|
| 25 |
+
point1 = (1488, 607)
|
| 26 |
+
point2 = (579, 704)
|
| 27 |
+
# truck.jpg
|
| 28 |
+
# point0 = (500, 375)
|
| 29 |
+
os.makedirs(args.output_dir, exist_ok=True)
|
| 30 |
+
|
| 31 |
+
for i, point in enumerate([point0, point1, point2]):
|
| 32 |
+
image_draw = image.copy()
|
| 33 |
+
|
| 34 |
+
output = decoder.decode(image_embedding[0], point = point,scale = scale)
|
| 35 |
+
idx = output[0].argmax()
|
| 36 |
+
|
| 37 |
+
image_draw = cv2.circle(image_draw, (int(point[0]), int(point[1])), 10, (0,255,0), -1)
|
| 38 |
+
mask = output[1][:,idx,:,:][0]
|
| 39 |
+
mask_mat = np.zeros((mask.shape[0], mask.shape[1]), dtype=np.uint8)
|
| 40 |
+
mask_mat[mask>0] = 255
|
| 41 |
+
mask_mat = cv2.resize(mask_mat, (max(w, h),max(w, h)),interpolation=cv2.INTER_LINEAR)
|
| 42 |
+
mask_mat = mask_mat[:h, :w]
|
| 43 |
+
cv2.imwrite(f"{args.output_dir}/point_mask_point_{i}.jpg", mask_mat)
|
| 44 |
+
mask_ovlap = np.zeros((mask_mat.shape[0], mask_mat.shape[1], 3), dtype=np.uint8)
|
| 45 |
+
mask_ovlap[mask_mat>0] = [0, 255, 0]
|
| 46 |
+
image_ovlap = cv2.addWeighted(image_draw, 1, mask_ovlap, 0.5, 0)
|
| 47 |
+
cv2.imwrite(f"{args.output_dir}/point_mask_ovlap_point_{i}.jpg", image_ovlap)
|
| 48 |
+
|
| 49 |
+
# for i in range(4):
|
| 50 |
+
# mask = output[1][:,i,:,:][0]
|
| 51 |
+
# mask_mat = np.zeros((mask.shape[0], mask.shape[1], 3), dtype=np.uint8)
|
| 52 |
+
# mask_mat[mask>0] = 255
|
| 53 |
+
# mask_mat = cv2.resize(mask_mat, (max(w, h),max(w, h)))
|
| 54 |
+
# mask_mat = mask_mat[:h, :w,:]
|
| 55 |
+
# cv2.imwrite(f"./output_ax/point_mask_{i}.jpg", mask_mat)
|
| 56 |
+
|
| 57 |
+
# box: topleft x, topleft y, width, height
|
| 58 |
+
# test.jpg
|
| 59 |
+
box0 = (910 - 160, 641 - 430, 380, 940)
|
| 60 |
+
box1 = (479, 482, 191, 518)
|
| 61 |
+
box2 = (1345, 333, 289, 701)
|
| 62 |
+
box3 = (1, 357, 311, 751)
|
| 63 |
+
# truck.jpg
|
| 64 |
+
# box0 = (1375, 550, 1650 - 1375, 800 - 550)
|
| 65 |
+
# box1 = (75, 275, 1725 - 75, 850 - 275)
|
| 66 |
+
# box2 = (425, 600, 700 - 425, 875 - 600)
|
| 67 |
+
# box3 = (1240, 675, 1400 - 1240, 750 - 675)
|
| 68 |
+
# car.jpg
|
| 69 |
+
# box0 = (450, 170, 520 - 450, 350 - 170)
|
| 70 |
+
# box1 = (350, 190, 450 - 350, 350 - 190)
|
| 71 |
+
# box2 = (500, 170, 580 - 500, 350 - 170)
|
| 72 |
+
# box3 = (580, 170, 640 - 580, 350 - 170)
|
| 73 |
+
for i, box in enumerate([box0, box1, box2, box3]):
|
| 74 |
+
image_draw = image.copy()
|
| 75 |
+
output = decoder.decode(image_embedding[0], box = box,scale = scale)
|
| 76 |
+
idx = output[0].argmax()
|
| 77 |
+
|
| 78 |
+
image_draw = cv2.rectangle(image_draw, (int(box[0]), int(box[1])), (int(box[0]+box[2]), int(box[1]+box[3])), (0,255,0), 2)
|
| 79 |
+
# cv2.imwrite(f"{args.output_dir}/box_image_{i}.jpg", image)
|
| 80 |
+
|
| 81 |
+
mask = output[1][:,idx,:,:][0]
|
| 82 |
+
mask_mat = np.zeros((mask.shape[0], mask.shape[1]), dtype=np.uint8)
|
| 83 |
+
mask_mat[mask>0] = 255
|
| 84 |
+
mask_mat = cv2.resize(mask_mat, (max(w, h),max(w, h)),interpolation=cv2.INTER_LINEAR)
|
| 85 |
+
mask_mat = mask_mat[:h, :w]
|
| 86 |
+
cv2.imwrite(f"{args.output_dir}/box_mask_box_{i}.jpg", mask_mat)
|
| 87 |
+
mask_ovlap = np.zeros((mask_mat.shape[0], mask_mat.shape[1], 3), dtype=np.uint8)
|
| 88 |
+
mask_ovlap[mask_mat>0] = [0, 255, 0]
|
| 89 |
+
image_ovlap = cv2.addWeighted(image_draw, 1, mask_ovlap, 0.5, 0)
|
| 90 |
+
cv2.imwrite(f"{args.output_dir}/box_mask_ovlap_box_{i}.jpg", image_ovlap)
|
| 91 |
+
|
| 92 |
+
# for i in range(4):
|
| 93 |
+
# mask = output[1][:,i,:,:][0]
|
| 94 |
+
# mask_mat = np.zeros((mask.shape[0], mask.shape[1], 3), dtype=np.uint8)
|
| 95 |
+
# mask_mat[mask>0] = 255
|
| 96 |
+
# mask_mat = cv2.resize(mask_mat, (max(w, h),max(w, h)))
|
| 97 |
+
# mask_mat = mask_mat[:h, :w,:]
|
| 98 |
+
# cv2.imwrite(f"./output_ax/box_mask_{i}.jpg", mask_mat)
|
| 99 |
+
|
python_ax/sam_decoder.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import axengine as axe
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class SAMDecoder:
|
| 6 |
+
|
| 7 |
+
def __init__(self, model_path):
|
| 8 |
+
self.sess = axe.InferenceSession(model_path)
|
| 9 |
+
for input in self.sess.get_inputs():
|
| 10 |
+
print(input.name, input.shape)
|
| 11 |
+
for output in self.sess.get_outputs():
|
| 12 |
+
print(output.name, output.shape)
|
| 13 |
+
|
| 14 |
+
self.mask = np.zeros((1, 1, 256, 256), np.float32)
|
| 15 |
+
self.has_mask = np.array([0], np.float32)
|
| 16 |
+
|
| 17 |
+
def decode(self, image_embedding, point = None, box = None, scale = None):
|
| 18 |
+
if point is not None:
|
| 19 |
+
point = np.array(point).astype(np.float32) * scale
|
| 20 |
+
point_coords = np.array([point, (0,0), (0,0), (0,0), (0,0)]).astype(np.float32).reshape((1, -1, 2))
|
| 21 |
+
point_labels = np.array([1, 0, 0, 0, 0], np.float32).reshape((1, -1))
|
| 22 |
+
elif box is not None:
|
| 23 |
+
box = np.array(box).astype(np.float32)*scale
|
| 24 |
+
x, y, w, h = box
|
| 25 |
+
center = np.array([x + w/2, y + h/2], np.float32)
|
| 26 |
+
topleft = np.array([x, y], np.float32)
|
| 27 |
+
bottomright = np.array([x + w, y + h], np.float32)
|
| 28 |
+
point_coords = np.array([center, topleft, bottomright, (0,0), (0,0)]).astype(np.float32).reshape((1, -1, 2))
|
| 29 |
+
point_labels = np.array([1, 2, 3, 0, 0], np.float32).reshape((1, -1))
|
| 30 |
+
else:
|
| 31 |
+
raise ValueError("Either point or box must be provided.")
|
| 32 |
+
inputs = {
|
| 33 |
+
"image_embeddings": image_embedding,
|
| 34 |
+
"point_coords": point_coords,
|
| 35 |
+
"point_labels": point_labels,
|
| 36 |
+
"mask_input": self.mask,
|
| 37 |
+
"has_mask_input": self.has_mask,
|
| 38 |
+
}
|
| 39 |
+
outputs = self.sess.run(None, inputs)
|
| 40 |
+
return outputs
|
python_ax/sam_encoder.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import axengine
|
| 2 |
+
import cv2
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
class SAMEncoder:
|
| 6 |
+
def __init__(self,model_path):
|
| 7 |
+
self.sess = axengine.InferenceSession(model_path)
|
| 8 |
+
for input in self.sess.get_inputs():
|
| 9 |
+
print(input.name, input.shape)
|
| 10 |
+
for output in self.sess.get_outputs():
|
| 11 |
+
print(output.name, output.shape)
|
| 12 |
+
self.input_shape = (1024, 1024)
|
| 13 |
+
|
| 14 |
+
def letterbox(self, image, target_size, color=(114, 114, 114)):
|
| 15 |
+
"""
|
| 16 |
+
将图像调整为目标大小,同时保持原始长宽比,并填充空白区域。
|
| 17 |
+
|
| 18 |
+
:param image: 输入图像 (H, W, C)
|
| 19 |
+
:param target_size: 目标尺寸 (width, height)
|
| 20 |
+
:param color: 填充颜色 (B, G, R)
|
| 21 |
+
:return: 调整后的图像,缩放比例,填充区域
|
| 22 |
+
"""
|
| 23 |
+
original_height, original_width = image.shape[:2]
|
| 24 |
+
target_width, target_height = target_size
|
| 25 |
+
|
| 26 |
+
# 计算缩放比例
|
| 27 |
+
scale = min(target_width / original_width, target_height / original_height)
|
| 28 |
+
new_width = int(original_width * scale)
|
| 29 |
+
new_height = int(original_height * scale)
|
| 30 |
+
|
| 31 |
+
# 调整图像大小
|
| 32 |
+
resized_image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_LINEAR)
|
| 33 |
+
|
| 34 |
+
# 计算填充
|
| 35 |
+
pad_width = (target_width - new_width) // 2
|
| 36 |
+
pad_height = (target_height - new_height) // 2
|
| 37 |
+
|
| 38 |
+
# 填充图像
|
| 39 |
+
padded_image = cv2.copyMakeBorder(
|
| 40 |
+
resized_image,
|
| 41 |
+
0 , target_height - new_height ,
|
| 42 |
+
0, target_width - new_width ,
|
| 43 |
+
cv2.BORDER_CONSTANT,
|
| 44 |
+
value=color
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
return padded_image, scale, (pad_width, pad_height)
|
| 48 |
+
|
| 49 |
+
def preprocess(self,image):
|
| 50 |
+
padded_image, scale, (pad_width, pad_height) = self.letterbox(image, self.input_shape)
|
| 51 |
+
|
| 52 |
+
padded_image = cv2.cvtColor(padded_image, cv2.COLOR_BGR2RGB)
|
| 53 |
+
padded_image = np.expand_dims(padded_image, axis=0)
|
| 54 |
+
return padded_image, scale
|
| 55 |
+
|
| 56 |
+
def encode(self,image):
|
| 57 |
+
padded_image, scale = self.preprocess(image)
|
| 58 |
+
|
| 59 |
+
# mean = np.array([0.485, 0.456, 0.406], dtype=np.float32).reshape((1,1,3))
|
| 60 |
+
# std = np.array([0.229, 0.224, 0.225], dtype=np.float32).reshape((1,1,3))
|
| 61 |
+
# padded_image = (padded_image.astype(np.float32)/255 - mean)/std
|
| 62 |
+
|
| 63 |
+
# padded_image = np.transpose(padded_image, (2, 0, 1))
|
| 64 |
+
# padded_image = np.expand_dims(padded_image, axis=0)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
return self.sess.run(None,{self.sess.get_inputs()[0].name:padded_image}), scale
|
python_onnx/__pycache__/sam_decoder.cpython-312.pyc
ADDED
|
Binary file (2.99 kB). View file
|
|
|
python_onnx/__pycache__/sam_decoder.cpython-313.pyc
ADDED
|
Binary file (3.07 kB). View file
|
|
|
python_onnx/__pycache__/sam_encoder.cpython-312.pyc
ADDED
|
Binary file (3.56 kB). View file
|
|
|
python_onnx/__pycache__/sam_encoder.cpython-313.pyc
ADDED
|
Binary file (3.49 kB). View file
|
|
|
python_onnx/main.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sam_encoder import SAMEncoder
|
| 2 |
+
from sam_decoder import SAMDecoder
|
| 3 |
+
import cv2
|
| 4 |
+
import numpy as np
|
| 5 |
+
import argparse
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
if __name__ == "__main__":
|
| 9 |
+
|
| 10 |
+
parser = argparse.ArgumentParser()
|
| 11 |
+
parser.add_argument("--img_path", "-i", type=str, default="../images/test.jpg", help="input image path")
|
| 12 |
+
parser.add_argument("--output_dir", "-o", type=str, default="./output", help="result path")
|
| 13 |
+
args = parser.parse_args()
|
| 14 |
+
|
| 15 |
+
encoder = SAMEncoder(f"../onnx/mobile_sam_encoder.onnx")
|
| 16 |
+
decoder = SAMDecoder(f"../onnx/mobile_sam_decoder_slim.onnx")
|
| 17 |
+
|
| 18 |
+
image = cv2.imread(args.img_path)
|
| 19 |
+
h, w, _ = image.shape
|
| 20 |
+
image_embedding, scale = encoder.encode(image)
|
| 21 |
+
print("Scale:", scale)
|
| 22 |
+
|
| 23 |
+
point0 = (910, 641)
|
| 24 |
+
point1 = (1488, 607)
|
| 25 |
+
point2 = (579, 704)
|
| 26 |
+
|
| 27 |
+
os.makedirs(args.output_dir, exist_ok=True)
|
| 28 |
+
for i, point in enumerate([point0, point1, point2]):
|
| 29 |
+
image_draw = image.copy()
|
| 30 |
+
|
| 31 |
+
output = decoder.decode(image_embedding[0], point = point,scale = scale)
|
| 32 |
+
idx = output[0].argmax()
|
| 33 |
+
|
| 34 |
+
image_draw = cv2.circle(image_draw, (int(point[0]), int(point[1])), 10, (0,255,0), -1)
|
| 35 |
+
mask = output[1][:,idx,:,:][0]
|
| 36 |
+
mask_mat = np.zeros((mask.shape[0], mask.shape[1]), dtype=np.uint8)
|
| 37 |
+
mask_mat[mask>0] = 255
|
| 38 |
+
mask_mat = cv2.resize(mask_mat, (max(w, h),max(w, h)),interpolation=cv2.INTER_LINEAR)
|
| 39 |
+
mask_mat = mask_mat[:h, :w]
|
| 40 |
+
cv2.imwrite(f"{args.output_dir}/point_mask_point_{i}.jpg", mask_mat)
|
| 41 |
+
mask_ovlap = np.zeros((mask_mat.shape[0], mask_mat.shape[1], 3), dtype=np.uint8)
|
| 42 |
+
mask_ovlap[mask_mat>0] = [0, 255, 0]
|
| 43 |
+
image_ovlap = cv2.addWeighted(image_draw, 1, mask_ovlap, 0.5, 0)
|
| 44 |
+
cv2.imwrite(f"{args.output_dir}/point_mask_ovlap_point_{i}.jpg", image_ovlap)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
box0 = (910 - 160, 641 - 430, 380, 940)
|
| 48 |
+
box1 = (479, 482, 191, 518)
|
| 49 |
+
box2 = (1345, 333, 289, 701)
|
| 50 |
+
box3 = (1, 357, 311, 751)
|
| 51 |
+
|
| 52 |
+
for i, box in enumerate([box0, box1, box2, box3]):
|
| 53 |
+
image_draw = image.copy()
|
| 54 |
+
output = decoder.decode(image_embedding[0], box = box,scale = scale)
|
| 55 |
+
idx = output[0].argmax()
|
| 56 |
+
|
| 57 |
+
image_draw = cv2.rectangle(image_draw, (int(box[0]), int(box[1])), (int(box[0]+box[2]), int(box[1]+box[3])), (0,255,0), 2)
|
| 58 |
+
# cv2.imwrite(f"{args.output_dir}/box_image_{i}.jpg", image)
|
| 59 |
+
|
| 60 |
+
mask = output[1][:,idx,:,:][0]
|
| 61 |
+
mask_mat = np.zeros((mask.shape[0], mask.shape[1]), dtype=np.uint8)
|
| 62 |
+
mask_mat[mask>0] = 255
|
| 63 |
+
mask_mat = cv2.resize(mask_mat, (max(w, h),max(w, h)),interpolation=cv2.INTER_LINEAR)
|
| 64 |
+
mask_mat = mask_mat[:h, :w]
|
| 65 |
+
cv2.imwrite(f"{args.output_dir}/box_mask_box_{i}.jpg", mask_mat)
|
| 66 |
+
mask_ovlap = np.zeros((mask_mat.shape[0], mask_mat.shape[1], 3), dtype=np.uint8)
|
| 67 |
+
mask_ovlap[mask_mat>0] = [0, 255, 0]
|
| 68 |
+
image_ovlap = cv2.addWeighted(image_draw, 1, mask_ovlap, 0.5, 0)
|
| 69 |
+
cv2.imwrite(f"{args.output_dir}/box_mask_ovlap_box_{i}.jpg", image_ovlap)
|
| 70 |
+
|
python_onnx/sam_decoder.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import onnxruntime
|
| 2 |
+
import cv2
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class SAMDecoder:
|
| 7 |
+
|
| 8 |
+
def __init__(self, model_path):
|
| 9 |
+
self.sess = onnxruntime.InferenceSession(model_path)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
self.mask = np.zeros((1, 1, 256, 256), np.float32)
|
| 13 |
+
self.has_mask = np.array([0], np.float32)
|
| 14 |
+
|
| 15 |
+
def decode(self, image_embedding, point = None, box = None, scale = None):
|
| 16 |
+
if point is not None:
|
| 17 |
+
point = np.array(point).astype(np.float32) * scale
|
| 18 |
+
point_coords = np.array([point, (0,0), (0,0), (0,0), (0,0)]).astype(np.float32).reshape((1, -1, 2))
|
| 19 |
+
point_labels = np.array([1, 0, 0, 0, 0], np.float32).reshape((1, -1))
|
| 20 |
+
elif box is not None:
|
| 21 |
+
box = np.array(box).astype(np.float32)*scale
|
| 22 |
+
x, y, w, h = box
|
| 23 |
+
center = np.array([x + w/2, y + h/2], np.float32)
|
| 24 |
+
topleft = np.array([x, y], np.float32)
|
| 25 |
+
bottomright = np.array([x + w, y + h], np.float32)
|
| 26 |
+
point_coords = np.array([center, topleft, bottomright, (0,0), (0,0)]).astype(np.float32).reshape((1, -1, 2))
|
| 27 |
+
point_labels = np.array([1, 2, 3, 0, 0], np.float32).reshape((1, -1))
|
| 28 |
+
else:
|
| 29 |
+
raise ValueError("Either point or box must be provided.")
|
| 30 |
+
inputs = {
|
| 31 |
+
"image_embeddings": image_embedding,
|
| 32 |
+
"point_coords": point_coords,
|
| 33 |
+
"point_labels": point_labels,
|
| 34 |
+
"mask_input": self.mask,
|
| 35 |
+
"has_mask_input": self.has_mask,
|
| 36 |
+
}
|
| 37 |
+
outputs = self.sess.run(None, inputs)
|
| 38 |
+
return outputs
|
python_onnx/sam_encoder.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import onnxruntime
|
| 2 |
+
import cv2
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
class SAMEncoder:
|
| 6 |
+
def __init__(self,model_path):
|
| 7 |
+
self.sess = onnxruntime.InferenceSession(model_path)
|
| 8 |
+
self.input_shape = (1024, 1024)
|
| 9 |
+
|
| 10 |
+
def letterbox(self, image, target_size, color=(114, 114, 114)):
|
| 11 |
+
"""
|
| 12 |
+
将图像调整为目标大小,同时保持原始长宽比,并填充空白区域。
|
| 13 |
+
|
| 14 |
+
:param image: 输入图像 (H, W, C)
|
| 15 |
+
:param target_size: 目标尺寸 (width, height)
|
| 16 |
+
:param color: 填充颜色 (B, G, R)
|
| 17 |
+
:return: 调整后的图像,缩放比例,填充区域
|
| 18 |
+
"""
|
| 19 |
+
original_height, original_width = image.shape[:2]
|
| 20 |
+
target_width, target_height = target_size
|
| 21 |
+
|
| 22 |
+
# 计算缩放比例
|
| 23 |
+
scale = min(target_width / original_width, target_height / original_height)
|
| 24 |
+
new_width = int(original_width * scale)
|
| 25 |
+
new_height = int(original_height * scale)
|
| 26 |
+
|
| 27 |
+
# 调整图像大小
|
| 28 |
+
resized_image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_LINEAR)
|
| 29 |
+
|
| 30 |
+
# 计算填充
|
| 31 |
+
pad_width = (target_width - new_width) // 2
|
| 32 |
+
pad_height = (target_height - new_height) // 2
|
| 33 |
+
|
| 34 |
+
# 填充图像
|
| 35 |
+
padded_image = cv2.copyMakeBorder(
|
| 36 |
+
resized_image,
|
| 37 |
+
0 , target_height - new_height ,
|
| 38 |
+
0, target_width - new_width ,
|
| 39 |
+
cv2.BORDER_CONSTANT,
|
| 40 |
+
value=color
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
return padded_image, scale, (pad_width, pad_height)
|
| 44 |
+
|
| 45 |
+
def preprocess(self,image):
|
| 46 |
+
padded_image, scale, (pad_width, pad_height) = self.letterbox(image, self.input_shape)
|
| 47 |
+
|
| 48 |
+
padded_image = cv2.cvtColor(padded_image, cv2.COLOR_BGR2RGB)
|
| 49 |
+
return padded_image, scale
|
| 50 |
+
|
| 51 |
+
def encode(self,image):
|
| 52 |
+
padded_image, scale = self.preprocess(image)
|
| 53 |
+
|
| 54 |
+
mean = np.array([0.485, 0.456, 0.406], dtype=np.float32).reshape((1,1,3))
|
| 55 |
+
std = np.array([0.229, 0.224, 0.225], dtype=np.float32).reshape((1,1,3))
|
| 56 |
+
padded_image = (padded_image.astype(np.float32)/255 - mean)/std
|
| 57 |
+
|
| 58 |
+
padded_image = np.transpose(padded_image, (2, 0, 1))
|
| 59 |
+
padded_image = np.expand_dims(padded_image, axis=0)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
return self.sess.run(None,{self.sess.get_inputs()[0].name:padded_image}), scale
|
requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
numpy
|
| 2 |
+
opencv-python
|