File size: 4,616 Bytes
66e269f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d7832c5
66e269f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b1542af
 
 
 
 
 
fa35c9b
b1542af
 
 
 
 
 
 
 
 
66e269f
b1542af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66e269f
 
d7832c5
66e269f
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import numpy as np
import cv2
import onnxruntime as ort
from typing import List, Dict, Tuple

class ObjectDetector:
    """Run an ONNX detection model on images and decode its raw output.

    The pipeline is: letterbox the image onto a square ``input_size`` canvas,
    run the ONNX Runtime session, then decode the raw prediction tensor into
    a list of detection dicts expressed in original-image pixel coordinates.

    Attributes:
        class_names: Label for each class index the model predicts.
        input_size: Side length, in pixels, of the square model input.
        session: The ONNX Runtime inference session.
        input_name: Name of the model's first input, read from the session.
    """

    def __init__(self, model_path: str, class_names: List[str], input_size: int = 640):
        """Load the model and run one warm-up inference.

        Args:
            model_path: Path to the ONNX model file.
            class_names: Labels indexed by the model's class id.
            input_size: Side length of the square input fed to the model.
        """
        self.class_names = class_names
        self.input_size = input_size
        self.session = self._load_model(model_path)
        # Read the input name from the model instead of hard-coding "images",
        # so exports that name their input differently still work.
        self.input_name = self.session.get_inputs()[0].name
        self._warmup()

    def _load_model(self, model_path: str) -> "ort.InferenceSession":
        """Create an inference session with all graph optimizations enabled.

        CUDA is listed before CPU in ``providers``, so it is the preferred
        execution provider and CPU is the fallback.
        """
        options = ort.SessionOptions()
        options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        return ort.InferenceSession(
            model_path,
            providers=['CUDAExecutionProvider', 'CPUExecutionProvider'],
            sess_options=options
        )

    def _warmup(self):
        """Run a single inference on random data.

        Presumably this absorbs one-time initialization cost before the
        first real prediction; the outputs are discarded.
        """
        dummy_input = np.random.randn(1, 3, self.input_size, self.input_size).astype(np.float32)
        self.session.run(None, {self.input_name: dummy_input})

    @staticmethod
    def compute_iou(box: np.ndarray, boxes: np.ndarray) -> np.ndarray:
        """Return the IoU of one box against each row of ``boxes``.

        Args:
            box: A single box as ``[x1, y1, x2, y2]``.
            boxes: Array of shape ``(N, 4)`` in the same corner format.

        Returns:
            Array of ``N`` IoU values. A small epsilon in the denominator
            keeps zero-area pairs from dividing by zero.
        """
        xmin = np.maximum(box[0], boxes[:, 0])
        ymin = np.maximum(box[1], boxes[:, 1])
        xmax = np.minimum(box[2], boxes[:, 2])
        ymax = np.minimum(box[3], boxes[:, 3])

        # Clamp at zero so disjoint boxes contribute no intersection.
        intersection_area = np.maximum(0, xmax - xmin) * np.maximum(0, ymax - ymin)
        box_area = (box[2] - box[0]) * (box[3] - box[1])
        boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])

        return intersection_area / (box_area + boxes_area - intersection_area + 1e-6)

    @staticmethod
    def nms(boxes: np.ndarray, scores: np.ndarray, iou_threshold: float) -> List[int]:
        """Greedy non-maximum suppression.

        Repeatedly keeps the highest-scoring remaining box and drops every
        other remaining box whose IoU with it is ``>= iou_threshold``.

        Args:
            boxes: Array of shape ``(N, 4)`` as ``[x1, y1, x2, y2]``.
            scores: Array of ``N`` scores aligned with ``boxes``.
            iou_threshold: Boxes at or above this IoU are suppressed.

        Returns:
            Indices into ``boxes`` of the kept boxes, highest score first,
            as plain Python ints.
        """
        order = np.argsort(scores)[::-1]
        keep_boxes: List[int] = []

        while order.size > 0:
            best = order[0]
            # Convert to a builtin int so the result matches List[int] and
            # serializes cleanly.
            keep_boxes.append(int(best))
            rest = order[1:]
            ious = ObjectDetector.compute_iou(boxes[best, :], boxes[rest, :])
            order = rest[ious < iou_threshold]
        return keep_boxes

    def preprocess(self, image: np.ndarray) -> Tuple[np.ndarray, float, Tuple[int, int]]:
        """Letterbox an image into the model's input tensor.

        The image is resized with its aspect ratio preserved, centred on a
        square canvas filled with the value 114, scaled to ``[0, 1]`` and
        reordered to ``(1, 3, input_size, input_size)``.

        NOTE(review): channel order is passed through unchanged. If images
        come from ``cv2.imread`` (BGR) and the model expects RGB, a
        conversion is needed upstream - confirm against the model.

        Args:
            image: Image array of shape ``(H, W, 3)``.

        Returns:
            ``(tensor, scale, (pad_w, pad_h))`` where ``scale`` is the resize
            factor and the padding is the left/top offset of the resized
            image on the canvas.

        Raises:
            ValueError: If the image has zero height or width.
        """
        h, w = image.shape[:2]
        if h == 0 or w == 0:
            raise ValueError("image must have non-zero height and width")

        size = self.input_size
        scale = min(size / h, size / w)
        # Clamp to [1, size]: extreme aspect ratios would otherwise truncate
        # the short side to 0 pixels, which cv2.resize rejects.
        new_h = min(size, max(1, int(h * scale)))
        new_w = min(size, max(1, int(w * scale)))

        resized = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
        canvas = np.full((size, size, 3), 114, dtype=np.uint8)
        ph, pw = (size - new_h) // 2, (size - new_w) // 2
        canvas[ph:ph+new_h, pw:pw+new_w] = resized

        blob = canvas.astype(np.float32) / 255.0
        # The transpose yields a strided view; hand the runtime a contiguous
        # buffer instead.
        tensor = np.ascontiguousarray(blob.transpose(2, 0, 1)[None, ...])
        return tensor, scale, (pw, ph)

    def postprocess(
        self,
        predictions: np.ndarray,
        original_shape: Tuple[int, int],
        scale: float,
        padding: Tuple[int, int],
        conf_threshold: float = 0.3,
        iou_threshold: float = 0.45,
        class_agnostic: bool = True
    ) -> List[Dict]:
        """Decode raw model output into detections in original-image pixels.

        Args:
            predictions: Raw output of shape ``(1, 4 + C, N)`` or
                ``(4 + C, N)``. Rows 0-3 are read as box centre x, centre y,
                width and height in letterboxed-input pixels; the remaining
                rows are per-class scores.
            original_shape: ``(height, width)`` of the original image.
            scale: Resize factor returned by ``preprocess``.
            padding: ``(pad_w, pad_h)`` returned by ``preprocess``.
            conf_threshold: Candidates whose best class score is not above
                this value are discarded.
            iou_threshold: IoU at or above which NMS suppresses a box.
            class_agnostic: When True (the default), boxes suppress each
                other regardless of class. When False, NMS is applied per
                class.

        Returns:
            One dict per kept detection with keys ``"class"``,
            ``"confidence"``, ``"bbox"`` (``[x1, y1, x2, y2]`` in pixels) and
            ``"bbox_normalized"`` (``[cx, cy, w, h]`` as fractions of the
            image size), ordered by descending confidence.

        Raises:
            ValueError: If ``predictions`` cannot be reduced to a 2-D
                ``(4 + C, N)`` array with at least one class row.
        """
        preds = np.asarray(predictions)
        # Strip only leading singleton (batch) axes. np.squeeze would also
        # collapse a single-anchor or single-row axis and break the layout.
        while preds.ndim > 2 and preds.shape[0] == 1:
            preds = preds[0]
        if preds.ndim != 2 or preds.shape[0] <= 4:
            raise ValueError(
                "predictions must have shape (1, 4 + num_classes, N) or "
                f"(4 + num_classes, N), got {np.shape(predictions)}"
            )

        # Rows become candidates; upcast half precision so box arithmetic
        # is done in at least float32.
        preds = preds.T.astype(np.result_type(preds.dtype, np.float32), copy=False)

        scores = np.max(preds[:, 4:], axis=1)
        valid = scores > conf_threshold
        if not valid.any():
            return []

        # Boolean indexing copies, so the caller's array is never modified.
        preds = preds[valid]
        scores = scores[valid]
        class_ids = np.argmax(preds[:, 4:], axis=1)

        # Centre/size -> corner coordinates, still in letterboxed space.
        x1 = preds[:, 0] - preds[:, 2] / 2
        y1 = preds[:, 1] - preds[:, 3] / 2
        x2 = x1 + preds[:, 2]
        y2 = y1 + preds[:, 3]

        # Undo the letterbox (remove padding, then the resize) and clip to
        # the original image bounds.
        pad_w, pad_h = padding
        h, w = original_shape
        boxes = np.stack(
            [
                np.clip((x1 - pad_w) / scale, 0, w),
                np.clip((y1 - pad_h) / scale, 0, h),
                np.clip((x2 - pad_w) / scale, 0, w),
                np.clip((y2 - pad_h) / scale, 0, h),
            ],
            axis=1,
        )

        if class_agnostic:
            nms_boxes = boxes
        else:
            # Shift each class into its own disjoint region so boxes of
            # different classes can never overlap during NMS.
            nms_boxes = boxes + (class_ids * (max(h, w) + 1.0))[:, None]
        indices = self.nms(nms_boxes, scores, iou_threshold)

        detections = []
        for i in indices:
            box = boxes[i]
            detections.append({
                "class": self.class_names[int(class_ids[i])],
                "confidence": float(scores[i]),
                "bbox": box.tolist(),
                "bbox_normalized": [
                    float((box[0] + box[2]) / 2 / w),
                    float((box[1] + box[3]) / 2 / h),
                    float((box[2] - box[0]) / w),
                    float((box[3] - box[1]) / h)
                ]
            })
        return detections

    def predict(self, image: np.ndarray) -> List[Dict]:
        """Detect objects in ``image``.

        Runs ``preprocess``, the model, and ``postprocess`` with the default
        thresholds, using the model's first output.

        Args:
            image: Image array of shape ``(H, W, 3)``.

        Returns:
            The detection dicts produced by ``postprocess``.
        """
        input_tensor, scale, padding = self.preprocess(image)
        outputs = self.session.run(None, {self.input_name: input_tensor})
        return self.postprocess(outputs[0], image.shape[:2], scale, padding)