Shilpaj committed on
Commit 291ee34 · 1 Parent(s): ca44e3b

Feat: Helper files for application

Files changed (2)
  1. config.py +194 -0
  2. inference.py +192 -0
config.py ADDED
@@ -0,0 +1,194 @@
+ #!/usr/bin/env python3
+ """
+ Configuration file
+ """
+ # Standard Library Imports
+ import os
+
+ # Third-Party Imports
+ import cv2
+ import torch
+ import albumentations as A
+ from albumentations.pytorch import ToTensorV2
+ from utils import seed_everything
+
+
+ DATASET = 'PASCAL_VOC'
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ seed_everything()  # If you want deterministic behavior
+ NUM_WORKERS = os.cpu_count()
+ BATCH_SIZE = 32
+ IMAGE_SIZE = 416
+ NUM_CLASSES = 20
+ LEARNING_RATE = 1e-5
+ WEIGHT_DECAY = 1e-4
+ NUM_EPOCHS = 100
+ CONF_THRESHOLD = 0.5
+ MAP_IOU_THRESH = 0.5
+ NMS_IOU_THRESH = 0.45
+ S = [IMAGE_SIZE // 32, IMAGE_SIZE // 16, IMAGE_SIZE // 8]
+ PIN_MEMORY = True
+ LOAD_MODEL = False
+ SAVE_MODEL = True
+ CHECKPOINT_FILE = "checkpoint.pth.tar"
+ IMG_DIR = DATASET + "/images/"
+ LABEL_DIR = DATASET + "/labels/"
+
+ ANCHORS = [
+     [(0.28, 0.22), (0.38, 0.48), (0.9, 0.78)],
+     [(0.07, 0.15), (0.15, 0.11), (0.14, 0.29)],
+     [(0.02, 0.03), (0.04, 0.07), (0.08, 0.06)],
+ ]  # Note: these have been rescaled to be between [0, 1]
+
+ SCALED_ANCHORS = (torch.tensor(ANCHORS) * torch.tensor(S).unsqueeze(1).unsqueeze(1).repeat(1, 3, 2)).to(DEVICE)
+
+ means = [0.485, 0.456, 0.406]
+
+ scale = 1.1
+ train_transforms = A.Compose(
+     [
+         A.LongestMaxSize(max_size=int(IMAGE_SIZE * scale)),
+         A.PadIfNeeded(
+             min_height=int(IMAGE_SIZE * scale),
+             min_width=int(IMAGE_SIZE * scale),
+             border_mode=cv2.BORDER_CONSTANT,
+         ),
+         A.Rotate(limit=10, interpolation=cv2.INTER_LINEAR, border_mode=cv2.BORDER_REFLECT_101),
+         A.RandomCrop(width=IMAGE_SIZE, height=IMAGE_SIZE),
+         A.ColorJitter(brightness=0.6, contrast=0.6, saturation=0.6, hue=0.6, p=0.4),
+         A.OneOf(
+             [
+                 A.ShiftScaleRotate(
+                     rotate_limit=20, p=0.5, border_mode=cv2.BORDER_CONSTANT
+                 ),
+                 # A.Affine(shear=15, p=0.5, mode="constant"),
+             ],
+             p=1.0,
+         ),
+         A.HorizontalFlip(p=0.5),
+         A.Blur(p=0.1),
+         A.CLAHE(p=0.1),
+         A.Posterize(p=0.1),
+         A.ToGray(p=0.1),
+         A.ChannelShuffle(p=0.05),
+         A.Normalize(mean=[0, 0, 0], std=[1, 1, 1], max_pixel_value=255),
+         ToTensorV2(),
+     ],
+     bbox_params=A.BboxParams(format="yolo", min_visibility=0.4, label_fields=[]),
+ )
+ test_transforms = A.Compose(
+     [
+         A.LongestMaxSize(max_size=IMAGE_SIZE),
+         A.PadIfNeeded(
+             min_height=IMAGE_SIZE, min_width=IMAGE_SIZE, border_mode=cv2.BORDER_CONSTANT
+         ),
+         A.Normalize(mean=[0, 0, 0], std=[1, 1, 1], max_pixel_value=255),
+         ToTensorV2(),
+     ],
+     bbox_params=A.BboxParams(format="yolo", min_visibility=0.4, label_fields=[]),
+ )
+
+ PASCAL_CLASSES = [
+     "aeroplane",
+     "bicycle",
+     "bird",
+     "boat",
+     "bottle",
+     "bus",
+     "car",
+     "cat",
+     "chair",
+     "cow",
+     "diningtable",
+     "dog",
+     "horse",
+     "motorbike",
+     "person",
+     "pottedplant",
+     "sheep",
+     "sofa",
+     "train",
+     "tvmonitor"
+ ]
+
+ COCO_LABELS = ['person',
+                'bicycle',
+                'car',
+                'motorcycle',
+                'airplane',
+                'bus',
+                'train',
+                'truck',
+                'boat',
+                'traffic light',
+                'fire hydrant',
+                'stop sign',
+                'parking meter',
+                'bench',
+                'bird',
+                'cat',
+                'dog',
+                'horse',
+                'sheep',
+                'cow',
+                'elephant',
+                'bear',
+                'zebra',
+                'giraffe',
+                'backpack',
+                'umbrella',
+                'handbag',
+                'tie',
+                'suitcase',
+                'frisbee',
+                'skis',
+                'snowboard',
+                'sports ball',
+                'kite',
+                'baseball bat',
+                'baseball glove',
+                'skateboard',
+                'surfboard',
+                'tennis racket',
+                'bottle',
+                'wine glass',
+                'cup',
+                'fork',
+                'knife',
+                'spoon',
+                'bowl',
+                'banana',
+                'apple',
+                'sandwich',
+                'orange',
+                'broccoli',
+                'carrot',
+                'hot dog',
+                'pizza',
+                'donut',
+                'cake',
+                'chair',
+                'couch',
+                'potted plant',
+                'bed',
+                'dining table',
+                'toilet',
+                'tv',
+                'laptop',
+                'mouse',
+                'remote',
+                'keyboard',
+                'cell phone',
+                'microwave',
+                'oven',
+                'toaster',
+                'sink',
+                'refrigerator',
+                'book',
+                'clock',
+                'vase',
+                'scissors',
+                'teddy bear',
+                'hair drier',
+                'toothbrush'
+                ]
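
For context, a minimal sketch of how these transform pipelines are typically consumed. The dummy image and the YOLO-format box are illustrative values, not part of the commit, and importing config assumes the repo's utils module (providing seed_everything) is on the path:

import numpy as np
import config  # runs seed_everything() at import time; needs utils importable

# Illustrative 375x500 RGB frame with one normalized YOLO box:
# (x_center, y_center, width, height, class_id)
image = np.random.randint(0, 256, (375, 500, 3), dtype=np.uint8)
bboxes = [(0.5, 0.5, 0.4, 0.3, 14)]  # 14 -> "person" in PASCAL_CLASSES

out = config.train_transforms(image=image, bboxes=bboxes)
tensor = out["image"]   # torch.Tensor of shape (3, 416, 416) after crop + ToTensorV2
kept = out["bboxes"]    # boxes that survive crop/flip given min_visibility=0.4

With label_fields=[] the class id rides along as the last element of each bbox tuple, which is why no separate labels list is passed.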
inference.py ADDED
@@ -0,0 +1,192 @@
+ """
+ Script to perform inference
+ Reference: https://huggingface.co/spaces/anantgupta129/PyTorch-YoloV3-PascolVOC-GradCAM/tree/main
+ """
+ import random
+ from typing import List
+
+ import cv2
+ import torch
+ import numpy as np
+ import albumentations as A
+ from albumentations.pytorch import ToTensorV2
+ from pytorch_grad_cam.utils.image import show_cam_on_image
+ from pytorch_grad_cam.base_cam import BaseCAM
+ from pytorch_grad_cam.utils.svd_on_activations import get_2d_projection
+ from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget
+
+ import config
+ from utils import cells_to_bboxes, non_max_suppression
+
+
+ IMAGE_SIZE = config.IMAGE_SIZE
+ scaled_anchors = config.SCALED_ANCHORS
+
+ _transforms = A.Compose(
+     [
+         A.LongestMaxSize(max_size=IMAGE_SIZE),
+         A.PadIfNeeded(
+             min_height=IMAGE_SIZE, min_width=IMAGE_SIZE, border_mode=cv2.BORDER_CONSTANT
+         ),
+         A.Normalize(mean=[0, 0, 0], std=[1, 1, 1], max_pixel_value=255),
+         ToTensorV2(),
+     ],
+ )
+
+
+ def draw_predictions(image: np.ndarray, boxes: List[List], class_labels: List[str]) -> np.ndarray:
+     """Plots predicted bounding boxes on the image"""
+
+     colors = [[random.randint(0, 255) for _ in range(3)] for _ in class_labels]
+
+     im = np.array(image)
+     height, width, _ = im.shape
+     bbox_thick = max(1, int(0.6 * (height + width) / 600))
+
+     # Draw a rectangle and label for each predicted box
+     for box in boxes:
+         assert len(box) == 6, "box should contain class pred, confidence, x, y, width, height"
+         class_pred = box[0]
+         conf = box[1]
+         box = box[2:]
+         # Boxes are in YOLO midpoint format, normalized to [0, 1]
+         upper_left_x = box[0] - box[2] / 2
+         upper_left_y = box[1] - box[3] / 2
+
+         x1 = int(upper_left_x * width)
+         y1 = int(upper_left_y * height)
+
+         x2 = x1 + int(box[2] * width)
+         y2 = y1 + int(box[3] * height)
+
+         cv2.rectangle(
+             image,
+             (x1, y1), (x2, y2),
+             color=colors[int(class_pred)],
+             thickness=bbox_thick
+         )
+         text = f"{class_labels[int(class_pred)]}: {conf:.2f}"
+         t_size = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.7, thickness=max(1, bbox_thick // 2))[0]
+         c3 = (x1 + t_size[0], y1 - t_size[1] - 3)
+
+         cv2.rectangle(image, (x1, y1), c3, colors[int(class_pred)], -1)  # filled label background
+         cv2.putText(
+             image,
+             text,
+             (x1, y1 - 2),
+             cv2.FONT_HERSHEY_SIMPLEX,
+             0.7,
+             (0, 0, 0),
+             max(1, bbox_thick // 2),
+             lineType=cv2.LINE_AA,
+         )
+
+     return image
+
+
+ class YoloCAM(BaseCAM):
+     def __init__(self, model, target_layers, use_cuda=False,
+                  reshape_transform=None):
+         super(YoloCAM, self).__init__(model,
+                                       target_layers,
+                                       use_cuda,
+                                       reshape_transform,
+                                       uses_gradients=False)
+
+     def forward(self,
+                 input_tensor: torch.Tensor,
+                 scaled_anchors: torch.Tensor,
+                 targets: List[torch.nn.Module],
+                 eigen_smooth: bool = False) -> np.ndarray:
+
+         if self.cuda:
+             input_tensor = input_tensor.cuda()
+
+         if self.compute_input_gradient:
+             input_tensor = torch.autograd.Variable(input_tensor,
+                                                    requires_grad=True)
+
+         outputs = self.activations_and_grads(input_tensor)
+         if targets is None:
+             # Derive targets from the model's own predictions: decode each
+             # scale, run NMS, and keep the surviving class ids
+             bboxes = [[] for _ in range(1)]  # single-image batch
+             for i in range(3):
+                 batch_size, num_anchors, S, _, _ = outputs[i].shape
+                 anchor = scaled_anchors[i]
+                 boxes_scale_i = cells_to_bboxes(
+                     outputs[i], anchor, S=S, is_preds=True
+                 )
+                 for idx, box in enumerate(boxes_scale_i):
+                     bboxes[idx] += box
+
+             nms_boxes = non_max_suppression(
+                 bboxes[0], iou_threshold=0.5, threshold=0.4, box_format="midpoint",
+             )
+             # target_categories = np.argmax(outputs.cpu().data.numpy(), axis=-1)
+             target_categories = [box[0] for box in nms_boxes]
+             targets = [ClassifierOutputTarget(
+                 category) for category in target_categories]
+
+         if self.uses_gradients:
+             self.model.zero_grad()
+             loss = sum([target(output)
+                         for target, output in zip(targets, outputs)])
+             loss.backward(retain_graph=True)
+
+         # In most saliency attribution papers, the saliency is computed
+         # with a single target layer, commonly the last convolutional layer.
+         # Here we support passing a list with multiple target layers:
+         # the saliency image is computed for every layer and then
+         # aggregated (with a mean aggregation by default).
+         # This gives you more flexibility in case you want to use,
+         # for example, all conv layers or all batchnorm layers.
+         cam_per_layer = self.compute_cam_per_layer(input_tensor,
+                                                    targets,
+                                                    eigen_smooth)
+         return self.aggregate_multi_layers(cam_per_layer)
+
+     def get_cam_image(self,
+                       input_tensor,
+                       target_layer,
+                       target_category,
+                       activations,
+                       grads,
+                       eigen_smooth):
+         # Gradient-free CAM: project the activations with a 2D SVD
+         return get_2d_projection(activations)
+
+
+ @torch.inference_mode()
+ def predict(cam,
+             model,
+             image: np.ndarray,
+             iou_thresh: float = 0.5,
+             thresh: float = 0.4,
+             show_cam: bool = False,
+             transparency: float = 0.5,
+             ) -> List[np.ndarray]:
+     transformed_image = _transforms(image=image)["image"].unsqueeze(0)
+     output = model(transformed_image)
+
+     bboxes = [[] for _ in range(1)]  # single-image batch
+     for i in range(3):
+         batch_size, num_anchors, S, _, _ = output[i].shape
+         anchor = scaled_anchors[i]
+         boxes_scale_i = cells_to_bboxes(
+             output[i], anchor, S=S, is_preds=True
+         )
+         for idx, box in enumerate(boxes_scale_i):
+             bboxes[idx] += box
+
+     nms_boxes = non_max_suppression(
+         bboxes[0], iou_threshold=iou_thresh, threshold=thresh, box_format="midpoint",
+     )
+     plot_img = draw_predictions(image.copy(), nms_boxes, class_labels=config.PASCAL_CLASSES)
+     if not show_cam:
+         return [plot_img]
+
+     grayscale_cam = cam(transformed_image, scaled_anchors)[0, :, :]
+     img = cv2.resize(image, (IMAGE_SIZE, IMAGE_SIZE))
+     img = np.float32(img) / 255
+     cam_image = show_cam_on_image(img, grayscale_cam, use_rgb=True, image_weight=transparency)
+     return [plot_img, cam_image]
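
For completeness, a hedged usage sketch of the pieces above. The model class (YOLOv3 from a model.py), the "state_dict" checkpoint key, the target layer model.layers[-2], and the sample image path are all placeholder assumptions for illustration, since the model definition is not part of this commit:

import cv2
import torch

import config
from inference import YoloCAM, predict
from model import YOLOv3  # assumed: the model definition lives elsewhere in the repo

model = YOLOv3(num_classes=config.NUM_CLASSES)
checkpoint = torch.load(config.CHECKPOINT_FILE, map_location="cpu")
model.load_state_dict(checkpoint["state_dict"])  # assumed checkpoint layout
model.eval()

# Grad-CAM target layer: a late conv block is the usual choice;
# model.layers[-2] is a placeholder, not something this commit prescribes
cam = YoloCAM(model=model, target_layers=[model.layers[-2]], use_cuda=False)

image = cv2.cvtColor(cv2.imread("sample.jpg"), cv2.COLOR_BGR2RGB)
results = predict(cam, model, image, show_cam=True)  # [boxes_image, cam_overlay]

With show_cam=False, predict returns only the annotated detection image; with show_cam=True it also returns the Eigen-CAM heatmap blended at the given transparency.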