Upload folder using huggingface_hub
Browse files- .gitattributes +12 -0
- README.md +46 -3
- ax_model/mobile_sam_decoder_620E.axmodel +3 -0
- ax_model/mobile_sam_decoder_650.axmodel +3 -0
- ax_model/mobile_sam_encoder_620E.axmodel +3 -0
- ax_model/mobile_sam_encoder_650.axmodel +3 -0
- config.json +0 -0
- images/box_mask_box_0.jpg +0 -0
- images/box_mask_box_1.jpg +0 -0
- images/box_mask_box_2.jpg +0 -0
- images/box_mask_box_3.jpg +0 -0
- images/box_mask_ovlap_box_0.jpg +3 -0
- images/box_mask_ovlap_box_1.jpg +3 -0
- images/box_mask_ovlap_box_2.jpg +3 -0
- images/box_mask_ovlap_box_3.jpg +3 -0
- images/car.jpg +3 -0
- images/point_mask_ovlap_point_0.jpg +3 -0
- images/point_mask_ovlap_point_1.jpg +3 -0
- images/point_mask_ovlap_point_2.jpg +3 -0
- images/point_mask_point_0.jpg +0 -0
- images/point_mask_point_1.jpg +0 -0
- images/point_mask_point_2.jpg +0 -0
- images/test.jpg +3 -0
- images/truck.jpg +3 -0
- onnx/mobile_sam_decoder_slim.onnx +3 -0
- onnx/mobile_sam_encoder.onnx +3 -0
- python_ax/__pycache__/sam_decoder.cpython-312.pyc +0 -0
- python_ax/__pycache__/sam_decoder.cpython-313.pyc +0 -0
- python_ax/__pycache__/sam_encoder.cpython-312.pyc +0 -0
- python_ax/__pycache__/sam_encoder.cpython-313.pyc +0 -0
- python_ax/main.py +99 -0
- python_ax/sam_decoder.py +40 -0
- python_ax/sam_encoder.py +67 -0
- python_onnx/__pycache__/sam_decoder.cpython-312.pyc +0 -0
- python_onnx/__pycache__/sam_decoder.cpython-313.pyc +0 -0
- python_onnx/__pycache__/sam_encoder.cpython-312.pyc +0 -0
- python_onnx/__pycache__/sam_encoder.cpython-313.pyc +0 -0
- python_onnx/main.py +70 -0
- python_onnx/sam_decoder.py +38 -0
- python_onnx/sam_encoder.py +62 -0
- requirements.txt +2 -0
.gitattributes
CHANGED
|
@@ -35,3 +35,15 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
ax_model/mobile_sam_decoder_620E.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 37 |
ax_model/mobile_sam_decoder_650.axmodel filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
ax_model/mobile_sam_decoder_620E.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 37 |
ax_model/mobile_sam_decoder_650.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
ax_model/mobile_sam_encoder_620E.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
ax_model/mobile_sam_encoder_650.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
images/box_mask_ovlap_box_0.jpg filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
images/box_mask_ovlap_box_1.jpg filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
images/box_mask_ovlap_box_2.jpg filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
images/box_mask_ovlap_box_3.jpg filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
images/car.jpg filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
images/point_mask_ovlap_point_0.jpg filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
images/point_mask_ovlap_point_1.jpg filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
images/point_mask_ovlap_point_2.jpg filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
images/test.jpg filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
images/truck.jpg filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,3 +1,46 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# MobileSAM
|
| 2 |
+
基于MobileSAM的图像分割Pipeline,支持多种输入提示(框、点、掩码),支持650N/620E系列平台的模型推理。
|
| 3 |
+
|
| 4 |
+
支持芯片:
|
| 5 |
+
- AX650N
|
| 6 |
+
- AX620E
|
| 7 |
+
- AX630C
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
支持硬件
|
| 11 |
+
|
| 12 |
+
- [M4N-Dock(爱芯派Pro)](https://wiki.sipeed.com/hardware/zh/maixIV/m4ndock/m4ndock.html)
|
| 13 |
+
- [M.2 Accelerator card](https://docs.m5stack.com/zh_CN/ai_hardware/LLM-8850_Card)
|
| 14 |
+
|
| 15 |
+
原始模型请参考
|
| 16 |
+
- [MobileSAM Github](https://github.com/ChaoningZhang/MobileSAM)
|
| 17 |
+
|
| 18 |
+
## 性能对比
|
| 19 |
+
|
| 20 |
+
- 输入图片大小 1024x1024
|
| 21 |
+
|
| 22 |
+
|Chip| Models | Latency (ms) | CMM Usage (MiB) |
|
| 23 |
+
|----| --------------------- | ---------------------- | -------------- |
|
| 24 |
+
|650N| mobile_sam_encoder |49.495 | 48.334 |
|
| 25 |
+
|620E| mobile_sam_encoder |520.044 | 63.231 |
|
| 26 |
+
|650N| mobile_sam_decoder |9.930 | 16.703 |
|
| 27 |
+
|620E| mobile_sam_decoder |36.382 | 14.970 |
|
| 28 |
+
|
| 29 |
+
## 模型转换
|
| 30 |
+
- 模型转换工具链[Pulsar2](https://huggingface.co/AXERA-TECH/Pulsar2)
|
| 31 |
+
- 转换文档[Model Convert](https://github.com/AXERA-TECH/MobileSAM.axera/tree/master/convert)
|
| 32 |
+
|
| 33 |
+
## 环境准备
|
| 34 |
+
- NPU Python API: [pyaxengine](https://github.com/AXERA-TECH/pyaxengine)
|
| 35 |
+
|
| 36 |
+
安装需要的python库
|
| 37 |
+
```pip install -r requirements.txt```
|
| 38 |
+
|
| 39 |
+
## 运行
|
| 40 |
+
```shell
|
| 41 |
+
cd python_ax
|
| 42 |
+
python3 main.py -i ../images/test.jpg -c 650
|
| 43 |
+
```
|
| 44 |
+
output:
|
| 45 |
+

|
| 46 |
+

|
ax_model/mobile_sam_decoder_620E.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8f968ca9f3cb313de45cc8519b62dcae2c834613f58abce85a14f8d7cd8cc7fd
|
| 3 |
+
size 7049493
|
ax_model/mobile_sam_decoder_650.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bfa87a19cfa3a95518d789cc1e5684b1d8d3c8d19b4e1e8155de5b5f5627020e
|
| 3 |
+
size 7031379
|
ax_model/mobile_sam_encoder_620E.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e941235ea497f63d91176cb6dfe856e52ccfcf6ec26157d75bcbd43aa31af851
|
| 3 |
+
size 25503415
|
ax_model/mobile_sam_encoder_650.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:88ad4b13478d7adfb6db44661933e5273207dfb15c1a74588fda860944501f9f
|
| 3 |
+
size 16467039
|
config.json
ADDED
|
File without changes
|
images/box_mask_box_0.jpg
ADDED
|
images/box_mask_box_1.jpg
ADDED
|
images/box_mask_box_2.jpg
ADDED
|
images/box_mask_box_3.jpg
ADDED
|
images/box_mask_ovlap_box_0.jpg
ADDED
|
Git LFS Details
|
images/box_mask_ovlap_box_1.jpg
ADDED
|
Git LFS Details
|
images/box_mask_ovlap_box_2.jpg
ADDED
|
Git LFS Details
|
images/box_mask_ovlap_box_3.jpg
ADDED
|
Git LFS Details
|
images/car.jpg
ADDED
|
Git LFS Details
|
images/point_mask_ovlap_point_0.jpg
ADDED
|
Git LFS Details
|
images/point_mask_ovlap_point_1.jpg
ADDED
|
Git LFS Details
|
images/point_mask_ovlap_point_2.jpg
ADDED
|
Git LFS Details
|
images/point_mask_point_0.jpg
ADDED
|
images/point_mask_point_1.jpg
ADDED
|
images/point_mask_point_2.jpg
ADDED
|
images/test.jpg
ADDED
|
Git LFS Details
|
images/truck.jpg
ADDED
|
Git LFS Details
|
onnx/mobile_sam_decoder_slim.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c7587d54e1df69e90573ad8b5672686e7ae8a295761fd3fd5d7ea45710ff92bb
|
| 3 |
+
size 20593937
|
onnx/mobile_sam_encoder.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8e35f2d687beafe0e327d4197ba015bca22a73142c65899572278d04fbecb149
|
| 3 |
+
size 27996273
|
python_ax/__pycache__/sam_decoder.cpython-312.pyc
ADDED
|
Binary file (3.37 kB). View file
|
|
|
python_ax/__pycache__/sam_decoder.cpython-313.pyc
ADDED
|
Binary file (3.49 kB). View file
|
|
|
python_ax/__pycache__/sam_encoder.cpython-312.pyc
ADDED
|
Binary file (3.32 kB). View file
|
|
|
python_ax/__pycache__/sam_encoder.cpython-313.pyc
ADDED
|
Binary file (3.27 kB). View file
|
|
|
python_ax/main.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sam_encoder import SAMEncoder
|
| 2 |
+
from sam_decoder import SAMDecoder
|
| 3 |
+
import cv2
|
| 4 |
+
import numpy as np
|
| 5 |
+
import argparse
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
if __name__ == "__main__":
|
| 9 |
+
|
| 10 |
+
parser = argparse.ArgumentParser()
|
| 11 |
+
parser.add_argument("--img_path", "-i", type=str, default="../images/test.jpg", help="input image path")
|
| 12 |
+
parser.add_argument("--output_dir", "-o", type=str, default="./output", help="result path")
|
| 13 |
+
parser.add_argument("--chip", "-c", type=str, default="650", help="650 or 620E")
|
| 14 |
+
args = parser.parse_args()
|
| 15 |
+
|
| 16 |
+
encoder = SAMEncoder(f"../ax_model/mobile_sam_encoder_{args.chip}.axmodel")
|
| 17 |
+
decoder = SAMDecoder(f"../ax_model/mobile_sam_decoder_{args.chip}.axmodel")
|
| 18 |
+
|
| 19 |
+
image = cv2.imread(args.img_path)
|
| 20 |
+
h, w, _ = image.shape
|
| 21 |
+
image_embedding, scale = encoder.encode(image)
|
| 22 |
+
|
| 23 |
+
# test.jpg
|
| 24 |
+
point0 = (910, 641)
|
| 25 |
+
point1 = (1488, 607)
|
| 26 |
+
point2 = (579, 704)
|
| 27 |
+
# truck.jpg
|
| 28 |
+
# point0 = (500, 375)
|
| 29 |
+
os.makedirs(args.output_dir, exist_ok=True)
|
| 30 |
+
|
| 31 |
+
for i, point in enumerate([point0, point1, point2]):
|
| 32 |
+
image_draw = image.copy()
|
| 33 |
+
|
| 34 |
+
output = decoder.decode(image_embedding[0], point = point,scale = scale)
|
| 35 |
+
idx = output[0].argmax()
|
| 36 |
+
|
| 37 |
+
image_draw = cv2.circle(image_draw, (int(point[0]), int(point[1])), 10, (0,255,0), -1)
|
| 38 |
+
mask = output[1][:,idx,:,:][0]
|
| 39 |
+
mask_mat = np.zeros((mask.shape[0], mask.shape[1]), dtype=np.uint8)
|
| 40 |
+
mask_mat[mask>0] = 255
|
| 41 |
+
mask_mat = cv2.resize(mask_mat, (max(w, h),max(w, h)),interpolation=cv2.INTER_LINEAR)
|
| 42 |
+
mask_mat = mask_mat[:h, :w]
|
| 43 |
+
cv2.imwrite(f"{args.output_dir}/point_mask_point_{i}.jpg", mask_mat)
|
| 44 |
+
mask_ovlap = np.zeros((mask_mat.shape[0], mask_mat.shape[1], 3), dtype=np.uint8)
|
| 45 |
+
mask_ovlap[mask_mat>0] = [0, 255, 0]
|
| 46 |
+
image_ovlap = cv2.addWeighted(image_draw, 1, mask_ovlap, 0.5, 0)
|
| 47 |
+
cv2.imwrite(f"{args.output_dir}/point_mask_ovlap_point_{i}.jpg", image_ovlap)
|
| 48 |
+
|
| 49 |
+
# for i in range(4):
|
| 50 |
+
# mask = output[1][:,i,:,:][0]
|
| 51 |
+
# mask_mat = np.zeros((mask.shape[0], mask.shape[1], 3), dtype=np.uint8)
|
| 52 |
+
# mask_mat[mask>0] = 255
|
| 53 |
+
# mask_mat = cv2.resize(mask_mat, (max(w, h),max(w, h)))
|
| 54 |
+
# mask_mat = mask_mat[:h, :w,:]
|
| 55 |
+
# cv2.imwrite(f"./output_ax/point_mask_{i}.jpg", mask_mat)
|
| 56 |
+
|
| 57 |
+
# box: topleft x, topleft y, width, height
|
| 58 |
+
# test.jpg
|
| 59 |
+
box0 = (910 - 160, 641 - 430, 380, 940)
|
| 60 |
+
box1 = (479, 482, 191, 518)
|
| 61 |
+
box2 = (1345, 333, 289, 701)
|
| 62 |
+
box3 = (1, 357, 311, 751)
|
| 63 |
+
# truck.jpg
|
| 64 |
+
# box0 = (1375, 550, 1650 - 1375, 800 - 550)
|
| 65 |
+
# box1 = (75, 275, 1725 - 75, 850 - 275)
|
| 66 |
+
# box2 = (425, 600, 700 - 425, 875 - 600)
|
| 67 |
+
# box3 = (1240, 675, 1400 - 1240, 750 - 675)
|
| 68 |
+
# car.jpg
|
| 69 |
+
# box0 = (450, 170, 520 - 450, 350 - 170)
|
| 70 |
+
# box1 = (350, 190, 450 - 350, 350 - 190)
|
| 71 |
+
# box2 = (500, 170, 580 - 500, 350 - 170)
|
| 72 |
+
# box3 = (580, 170, 640 - 580, 350 - 170)
|
| 73 |
+
for i, box in enumerate([box0, box1, box2, box3]):
|
| 74 |
+
image_draw = image.copy()
|
| 75 |
+
output = decoder.decode(image_embedding[0], box = box,scale = scale)
|
| 76 |
+
idx = output[0].argmax()
|
| 77 |
+
|
| 78 |
+
image_draw = cv2.rectangle(image_draw, (int(box[0]), int(box[1])), (int(box[0]+box[2]), int(box[1]+box[3])), (0,255,0), 2)
|
| 79 |
+
# cv2.imwrite(f"{args.output_dir}/box_image_{i}.jpg", image)
|
| 80 |
+
|
| 81 |
+
mask = output[1][:,idx,:,:][0]
|
| 82 |
+
mask_mat = np.zeros((mask.shape[0], mask.shape[1]), dtype=np.uint8)
|
| 83 |
+
mask_mat[mask>0] = 255
|
| 84 |
+
mask_mat = cv2.resize(mask_mat, (max(w, h),max(w, h)),interpolation=cv2.INTER_LINEAR)
|
| 85 |
+
mask_mat = mask_mat[:h, :w]
|
| 86 |
+
cv2.imwrite(f"{args.output_dir}/box_mask_box_{i}.jpg", mask_mat)
|
| 87 |
+
mask_ovlap = np.zeros((mask_mat.shape[0], mask_mat.shape[1], 3), dtype=np.uint8)
|
| 88 |
+
mask_ovlap[mask_mat>0] = [0, 255, 0]
|
| 89 |
+
image_ovlap = cv2.addWeighted(image_draw, 1, mask_ovlap, 0.5, 0)
|
| 90 |
+
cv2.imwrite(f"{args.output_dir}/box_mask_ovlap_box_{i}.jpg", image_ovlap)
|
| 91 |
+
|
| 92 |
+
# for i in range(4):
|
| 93 |
+
# mask = output[1][:,i,:,:][0]
|
| 94 |
+
# mask_mat = np.zeros((mask.shape[0], mask.shape[1], 3), dtype=np.uint8)
|
| 95 |
+
# mask_mat[mask>0] = 255
|
| 96 |
+
# mask_mat = cv2.resize(mask_mat, (max(w, h),max(w, h)))
|
| 97 |
+
# mask_mat = mask_mat[:h, :w,:]
|
| 98 |
+
# cv2.imwrite(f"./output_ax/box_mask_{i}.jpg", mask_mat)
|
| 99 |
+
|
python_ax/sam_decoder.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import axengine as axe
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class SAMDecoder:
|
| 6 |
+
|
| 7 |
+
def __init__(self, model_path):
|
| 8 |
+
self.sess = axe.InferenceSession(model_path)
|
| 9 |
+
for input in self.sess.get_inputs():
|
| 10 |
+
print(input.name, input.shape)
|
| 11 |
+
for output in self.sess.get_outputs():
|
| 12 |
+
print(output.name, output.shape)
|
| 13 |
+
|
| 14 |
+
self.mask = np.zeros((1, 1, 256, 256), np.float32)
|
| 15 |
+
self.has_mask = np.array([0], np.float32)
|
| 16 |
+
|
| 17 |
+
def decode(self, image_embedding, point = None, box = None, scale = None):
|
| 18 |
+
if point is not None:
|
| 19 |
+
point = np.array(point).astype(np.float32) * scale
|
| 20 |
+
point_coords = np.array([point, (0,0), (0,0), (0,0), (0,0)]).astype(np.float32).reshape((1, -1, 2))
|
| 21 |
+
point_labels = np.array([1, 0, 0, 0, 0], np.float32).reshape((1, -1))
|
| 22 |
+
elif box is not None:
|
| 23 |
+
box = np.array(box).astype(np.float32)*scale
|
| 24 |
+
x, y, w, h = box
|
| 25 |
+
center = np.array([x + w/2, y + h/2], np.float32)
|
| 26 |
+
topleft = np.array([x, y], np.float32)
|
| 27 |
+
bottomright = np.array([x + w, y + h], np.float32)
|
| 28 |
+
point_coords = np.array([center, topleft, bottomright, (0,0), (0,0)]).astype(np.float32).reshape((1, -1, 2))
|
| 29 |
+
point_labels = np.array([1, 2, 3, 0, 0], np.float32).reshape((1, -1))
|
| 30 |
+
else:
|
| 31 |
+
raise ValueError("Either point or box must be provided.")
|
| 32 |
+
inputs = {
|
| 33 |
+
"image_embeddings": image_embedding,
|
| 34 |
+
"point_coords": point_coords,
|
| 35 |
+
"point_labels": point_labels,
|
| 36 |
+
"mask_input": self.mask,
|
| 37 |
+
"has_mask_input": self.has_mask,
|
| 38 |
+
}
|
| 39 |
+
outputs = self.sess.run(None, inputs)
|
| 40 |
+
return outputs
|
python_ax/sam_encoder.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import axengine
|
| 2 |
+
import cv2
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
class SAMEncoder:
|
| 6 |
+
def __init__(self,model_path):
|
| 7 |
+
self.sess = axengine.InferenceSession(model_path)
|
| 8 |
+
for input in self.sess.get_inputs():
|
| 9 |
+
print(input.name, input.shape)
|
| 10 |
+
for output in self.sess.get_outputs():
|
| 11 |
+
print(output.name, output.shape)
|
| 12 |
+
self.input_shape = (1024, 1024)
|
| 13 |
+
|
| 14 |
+
def letterbox(self, image, target_size, color=(114, 114, 114)):
|
| 15 |
+
"""
|
| 16 |
+
将图像调整为目标大小,同时保持原始长宽比,并填充空白区域。
|
| 17 |
+
|
| 18 |
+
:param image: 输入图像 (H, W, C)
|
| 19 |
+
:param target_size: 目标尺寸 (width, height)
|
| 20 |
+
:param color: 填充颜色 (B, G, R)
|
| 21 |
+
:return: 调整后的图像,缩放比例,填充区域
|
| 22 |
+
"""
|
| 23 |
+
original_height, original_width = image.shape[:2]
|
| 24 |
+
target_width, target_height = target_size
|
| 25 |
+
|
| 26 |
+
# 计算缩放比例
|
| 27 |
+
scale = min(target_width / original_width, target_height / original_height)
|
| 28 |
+
new_width = int(original_width * scale)
|
| 29 |
+
new_height = int(original_height * scale)
|
| 30 |
+
|
| 31 |
+
# 调整图像大小
|
| 32 |
+
resized_image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_LINEAR)
|
| 33 |
+
|
| 34 |
+
# 计算填充
|
| 35 |
+
pad_width = (target_width - new_width) // 2
|
| 36 |
+
pad_height = (target_height - new_height) // 2
|
| 37 |
+
|
| 38 |
+
# 填充图像
|
| 39 |
+
padded_image = cv2.copyMakeBorder(
|
| 40 |
+
resized_image,
|
| 41 |
+
0 , target_height - new_height ,
|
| 42 |
+
0, target_width - new_width ,
|
| 43 |
+
cv2.BORDER_CONSTANT,
|
| 44 |
+
value=color
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
return padded_image, scale, (pad_width, pad_height)
|
| 48 |
+
|
| 49 |
+
def preprocess(self,image):
|
| 50 |
+
padded_image, scale, (pad_width, pad_height) = self.letterbox(image, self.input_shape)
|
| 51 |
+
|
| 52 |
+
padded_image = cv2.cvtColor(padded_image, cv2.COLOR_BGR2RGB)
|
| 53 |
+
padded_image = np.expand_dims(padded_image, axis=0)
|
| 54 |
+
return padded_image, scale
|
| 55 |
+
|
| 56 |
+
def encode(self,image):
|
| 57 |
+
padded_image, scale = self.preprocess(image)
|
| 58 |
+
|
| 59 |
+
# mean = np.array([0.485, 0.456, 0.406], dtype=np.float32).reshape((1,1,3))
|
| 60 |
+
# std = np.array([0.229, 0.224, 0.225], dtype=np.float32).reshape((1,1,3))
|
| 61 |
+
# padded_image = (padded_image.astype(np.float32)/255 - mean)/std
|
| 62 |
+
|
| 63 |
+
# padded_image = np.transpose(padded_image, (2, 0, 1))
|
| 64 |
+
# padded_image = np.expand_dims(padded_image, axis=0)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
return self.sess.run(None,{self.sess.get_inputs()[0].name:padded_image}), scale
|
python_onnx/__pycache__/sam_decoder.cpython-312.pyc
ADDED
|
Binary file (2.99 kB). View file
|
|
|
python_onnx/__pycache__/sam_decoder.cpython-313.pyc
ADDED
|
Binary file (3.07 kB). View file
|
|
|
python_onnx/__pycache__/sam_encoder.cpython-312.pyc
ADDED
|
Binary file (3.56 kB). View file
|
|
|
python_onnx/__pycache__/sam_encoder.cpython-313.pyc
ADDED
|
Binary file (3.49 kB). View file
|
|
|
python_onnx/main.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sam_encoder import SAMEncoder
|
| 2 |
+
from sam_decoder import SAMDecoder
|
| 3 |
+
import cv2
|
| 4 |
+
import numpy as np
|
| 5 |
+
import argparse
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
if __name__ == "__main__":
|
| 9 |
+
|
| 10 |
+
parser = argparse.ArgumentParser()
|
| 11 |
+
parser.add_argument("--img_path", "-i", type=str, default="../images/test.jpg", help="input image path")
|
| 12 |
+
parser.add_argument("--output_dir", "-o", type=str, default="./output", help="result path")
|
| 13 |
+
args = parser.parse_args()
|
| 14 |
+
|
| 15 |
+
encoder = SAMEncoder(f"../onnx/mobile_sam_encoder.onnx")
|
| 16 |
+
decoder = SAMDecoder(f"../onnx/mobile_sam_decoder_slim.onnx")
|
| 17 |
+
|
| 18 |
+
image = cv2.imread(args.img_path)
|
| 19 |
+
h, w, _ = image.shape
|
| 20 |
+
image_embedding, scale = encoder.encode(image)
|
| 21 |
+
print("Scale:", scale)
|
| 22 |
+
|
| 23 |
+
point0 = (910, 641)
|
| 24 |
+
point1 = (1488, 607)
|
| 25 |
+
point2 = (579, 704)
|
| 26 |
+
|
| 27 |
+
os.makedirs(args.output_dir, exist_ok=True)
|
| 28 |
+
for i, point in enumerate([point0, point1, point2]):
|
| 29 |
+
image_draw = image.copy()
|
| 30 |
+
|
| 31 |
+
output = decoder.decode(image_embedding[0], point = point,scale = scale)
|
| 32 |
+
idx = output[0].argmax()
|
| 33 |
+
|
| 34 |
+
image_draw = cv2.circle(image_draw, (int(point[0]), int(point[1])), 10, (0,255,0), -1)
|
| 35 |
+
mask = output[1][:,idx,:,:][0]
|
| 36 |
+
mask_mat = np.zeros((mask.shape[0], mask.shape[1]), dtype=np.uint8)
|
| 37 |
+
mask_mat[mask>0] = 255
|
| 38 |
+
mask_mat = cv2.resize(mask_mat, (max(w, h),max(w, h)),interpolation=cv2.INTER_LINEAR)
|
| 39 |
+
mask_mat = mask_mat[:h, :w]
|
| 40 |
+
cv2.imwrite(f"{args.output_dir}/point_mask_point_{i}.jpg", mask_mat)
|
| 41 |
+
mask_ovlap = np.zeros((mask_mat.shape[0], mask_mat.shape[1], 3), dtype=np.uint8)
|
| 42 |
+
mask_ovlap[mask_mat>0] = [0, 255, 0]
|
| 43 |
+
image_ovlap = cv2.addWeighted(image_draw, 1, mask_ovlap, 0.5, 0)
|
| 44 |
+
cv2.imwrite(f"{args.output_dir}/point_mask_ovlap_point_{i}.jpg", image_ovlap)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
box0 = (910 - 160, 641 - 430, 380, 940)
|
| 48 |
+
box1 = (479, 482, 191, 518)
|
| 49 |
+
box2 = (1345, 333, 289, 701)
|
| 50 |
+
box3 = (1, 357, 311, 751)
|
| 51 |
+
|
| 52 |
+
for i, box in enumerate([box0, box1, box2, box3]):
|
| 53 |
+
image_draw = image.copy()
|
| 54 |
+
output = decoder.decode(image_embedding[0], box = box,scale = scale)
|
| 55 |
+
idx = output[0].argmax()
|
| 56 |
+
|
| 57 |
+
image_draw = cv2.rectangle(image_draw, (int(box[0]), int(box[1])), (int(box[0]+box[2]), int(box[1]+box[3])), (0,255,0), 2)
|
| 58 |
+
# cv2.imwrite(f"{args.output_dir}/box_image_{i}.jpg", image)
|
| 59 |
+
|
| 60 |
+
mask = output[1][:,idx,:,:][0]
|
| 61 |
+
mask_mat = np.zeros((mask.shape[0], mask.shape[1]), dtype=np.uint8)
|
| 62 |
+
mask_mat[mask>0] = 255
|
| 63 |
+
mask_mat = cv2.resize(mask_mat, (max(w, h),max(w, h)),interpolation=cv2.INTER_LINEAR)
|
| 64 |
+
mask_mat = mask_mat[:h, :w]
|
| 65 |
+
cv2.imwrite(f"{args.output_dir}/box_mask_box_{i}.jpg", mask_mat)
|
| 66 |
+
mask_ovlap = np.zeros((mask_mat.shape[0], mask_mat.shape[1], 3), dtype=np.uint8)
|
| 67 |
+
mask_ovlap[mask_mat>0] = [0, 255, 0]
|
| 68 |
+
image_ovlap = cv2.addWeighted(image_draw, 1, mask_ovlap, 0.5, 0)
|
| 69 |
+
cv2.imwrite(f"{args.output_dir}/box_mask_ovlap_box_{i}.jpg", image_ovlap)
|
| 70 |
+
|
python_onnx/sam_decoder.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import onnxruntime
|
| 2 |
+
import cv2
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class SAMDecoder:
|
| 7 |
+
|
| 8 |
+
def __init__(self, model_path):
|
| 9 |
+
self.sess = onnxruntime.InferenceSession(model_path)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
self.mask = np.zeros((1, 1, 256, 256), np.float32)
|
| 13 |
+
self.has_mask = np.array([0], np.float32)
|
| 14 |
+
|
| 15 |
+
def decode(self, image_embedding, point = None, box = None, scale = None):
|
| 16 |
+
if point is not None:
|
| 17 |
+
point = np.array(point).astype(np.float32) * scale
|
| 18 |
+
point_coords = np.array([point, (0,0), (0,0), (0,0), (0,0)]).astype(np.float32).reshape((1, -1, 2))
|
| 19 |
+
point_labels = np.array([1, 0, 0, 0, 0], np.float32).reshape((1, -1))
|
| 20 |
+
elif box is not None:
|
| 21 |
+
box = np.array(box).astype(np.float32)*scale
|
| 22 |
+
x, y, w, h = box
|
| 23 |
+
center = np.array([x + w/2, y + h/2], np.float32)
|
| 24 |
+
topleft = np.array([x, y], np.float32)
|
| 25 |
+
bottomright = np.array([x + w, y + h], np.float32)
|
| 26 |
+
point_coords = np.array([center, topleft, bottomright, (0,0), (0,0)]).astype(np.float32).reshape((1, -1, 2))
|
| 27 |
+
point_labels = np.array([1, 2, 3, 0, 0], np.float32).reshape((1, -1))
|
| 28 |
+
else:
|
| 29 |
+
raise ValueError("Either point or box must be provided.")
|
| 30 |
+
inputs = {
|
| 31 |
+
"image_embeddings": image_embedding,
|
| 32 |
+
"point_coords": point_coords,
|
| 33 |
+
"point_labels": point_labels,
|
| 34 |
+
"mask_input": self.mask,
|
| 35 |
+
"has_mask_input": self.has_mask,
|
| 36 |
+
}
|
| 37 |
+
outputs = self.sess.run(None, inputs)
|
| 38 |
+
return outputs
|
python_onnx/sam_encoder.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import onnxruntime
|
| 2 |
+
import cv2
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
class SAMEncoder:
|
| 6 |
+
def __init__(self,model_path):
|
| 7 |
+
self.sess = onnxruntime.InferenceSession(model_path)
|
| 8 |
+
self.input_shape = (1024, 1024)
|
| 9 |
+
|
| 10 |
+
def letterbox(self, image, target_size, color=(114, 114, 114)):
|
| 11 |
+
"""
|
| 12 |
+
将图像调整为目标大小,同时保持原始长宽比,并填充空白区域。
|
| 13 |
+
|
| 14 |
+
:param image: 输入图像 (H, W, C)
|
| 15 |
+
:param target_size: 目标尺寸 (width, height)
|
| 16 |
+
:param color: 填充颜色 (B, G, R)
|
| 17 |
+
:return: 调整后的图像,缩放比例,填充区域
|
| 18 |
+
"""
|
| 19 |
+
original_height, original_width = image.shape[:2]
|
| 20 |
+
target_width, target_height = target_size
|
| 21 |
+
|
| 22 |
+
# 计算缩放比例
|
| 23 |
+
scale = min(target_width / original_width, target_height / original_height)
|
| 24 |
+
new_width = int(original_width * scale)
|
| 25 |
+
new_height = int(original_height * scale)
|
| 26 |
+
|
| 27 |
+
# 调整图像大小
|
| 28 |
+
resized_image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_LINEAR)
|
| 29 |
+
|
| 30 |
+
# 计算填充
|
| 31 |
+
pad_width = (target_width - new_width) // 2
|
| 32 |
+
pad_height = (target_height - new_height) // 2
|
| 33 |
+
|
| 34 |
+
# 填充图像
|
| 35 |
+
padded_image = cv2.copyMakeBorder(
|
| 36 |
+
resized_image,
|
| 37 |
+
0 , target_height - new_height ,
|
| 38 |
+
0, target_width - new_width ,
|
| 39 |
+
cv2.BORDER_CONSTANT,
|
| 40 |
+
value=color
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
return padded_image, scale, (pad_width, pad_height)
|
| 44 |
+
|
| 45 |
+
def preprocess(self,image):
|
| 46 |
+
padded_image, scale, (pad_width, pad_height) = self.letterbox(image, self.input_shape)
|
| 47 |
+
|
| 48 |
+
padded_image = cv2.cvtColor(padded_image, cv2.COLOR_BGR2RGB)
|
| 49 |
+
return padded_image, scale
|
| 50 |
+
|
| 51 |
+
def encode(self,image):
|
| 52 |
+
padded_image, scale = self.preprocess(image)
|
| 53 |
+
|
| 54 |
+
mean = np.array([0.485, 0.456, 0.406], dtype=np.float32).reshape((1,1,3))
|
| 55 |
+
std = np.array([0.229, 0.224, 0.225], dtype=np.float32).reshape((1,1,3))
|
| 56 |
+
padded_image = (padded_image.astype(np.float32)/255 - mean)/std
|
| 57 |
+
|
| 58 |
+
padded_image = np.transpose(padded_image, (2, 0, 1))
|
| 59 |
+
padded_image = np.expand_dims(padded_image, axis=0)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
return self.sess.run(None,{self.sess.get_inputs()[0].name:padded_image}), scale
|
requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
numpy
|
| 2 |
+
opencv-python
|