manh6054 commited on
Commit
338d306
·
verified ·
1 Parent(s): ae4a184

Upload 13 files

Browse files
FPN_SSD300_a.py ADDED
@@ -0,0 +1,450 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils.lib import *
2
+
3
class VGG16Base(nn.Module):
    """VGG-16 backbone adapted for SSD300.

    Differences from the stock torchvision VGG-16:
      - Input is 300x300 instead of 224x224 (the shape comments below assume 300x300).
      - The 3rd pooling layer uses ceil_mode, so 75 -> 38 rather than 37.
      - The 5th pooling layer is 3x3 / stride 1 / padding 1 (keeps spatial size).
      - fc6/fc7 are converted to conv6/conv7 by decimating their weights; fc8 is dropped.
    """

    def __init__(self):
        super().__init__()

        self.conv1_1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)
        self.conv1_2 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv2_1 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.conv2_2 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv3_1 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.conv3_2 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.conv3_3 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        # ceil_mode keeps 75 -> 38 instead of 37.
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)

        self.conv4_1 = nn.Conv2d(256, 512, kernel_size=3, padding=1)
        self.conv4_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv4_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        # Stride-1 pooling: spatial size is preserved.
        self.pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)

        # conv6/conv7 replace the fully-connected layers; conv6 uses atrous
        # (dilated) convolution as in the SSD paper.
        self.conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)
        self.conv7 = nn.Conv2d(1024, 1024, kernel_size=1)

    def decimate(self, tensor, steps):
        """Keep every `steps[d]`-th slice along each dimension d (None keeps all)."""
        assert len(steps) == tensor.dim()

        for dim, step in enumerate(steps):
            if step is not None:
                keep = torch.arange(start=0, end=tensor.shape[dim], step=step)
                tensor = tensor.index_select(dim=dim, index=keep)

        return tensor

    def load_pretrain(self):
        """Copy ImageNet-pretrained VGG-16 weights into this module.

        Pretrained and local parameter names differ, but the parameters line
        up in architectural order, so they are matched positionally.  fc6/fc7
        are reshaped and decimated into conv6/conv7.
        """
        state_dict = self.state_dict()
        param_names = list(state_dict.keys())

        # New torchvision weights API (old style: torchvision.models.vgg16(pretrained=True)).
        # See https://pytorch.org/vision/stable/models.html
        pretrained = torchvision.models.vgg16(weights='VGG16_Weights.DEFAULT').state_dict()
        pretrained_names = list(pretrained.keys())

        # The last 4 local params are conv6/conv7 weight+bias, handled below.
        for idx, name in enumerate(param_names[:-4]):
            state_dict[name] = pretrained[pretrained_names[idx]]

        # fc -> conv reshape.
        fc6_weight = pretrained['classifier.0.weight'].view(4096, 512, 7, 7)
        fc6_bias = pretrained['classifier.0.bias'].view(4096)

        fc7_weight = pretrained['classifier.3.weight'].view(4096, 4096, 1, 1)
        fc7_bias = pretrained['classifier.3.bias'].view(4096)

        # Downsample (decimate) the fc parameters to the conv6/conv7 shapes.
        state_dict['conv6.weight'] = self.decimate(fc6_weight, steps=[4, None, 3, 3])
        state_dict['conv6.bias'] = self.decimate(fc6_bias, steps=[4])

        state_dict['conv7.weight'] = self.decimate(fc7_weight, steps=[4, 4, None, None])
        state_dict['conv7.bias'] = self.decimate(fc7_bias, steps=[4])

        self.load_state_dict(state_dict)

    def forward(self, images):
        """
        :param images: tensor [N, 3, 300, 300]
        :return: (conv4_3_feats [N, 512, 38, 38], conv7_feats [N, 1024, 19, 19])
        """
        x = F.relu(self.conv1_1(images))   # [N, 64, 300, 300]
        x = F.relu(self.conv1_2(x))        # [N, 64, 300, 300]
        x = self.pool1(x)                  # [N, 64, 150, 150]

        x = F.relu(self.conv2_1(x))        # [N, 128, 150, 150]
        x = F.relu(self.conv2_2(x))        # [N, 128, 150, 150]
        x = self.pool2(x)                  # [N, 128, 75, 75]

        x = F.relu(self.conv3_1(x))        # [N, 256, 75, 75]
        x = F.relu(self.conv3_2(x))        # [N, 256, 75, 75]
        x = F.relu(self.conv3_3(x))        # [N, 256, 75, 75]
        x = self.pool3(x)                  # [N, 256, 38, 38] thanks to ceil_mode

        x = F.relu(self.conv4_1(x))        # [N, 512, 38, 38]
        x = F.relu(self.conv4_2(x))        # [N, 512, 38, 38]
        x = F.relu(self.conv4_3(x))        # [N, 512, 38, 38]
        conv4_3_feats = x                  # first detection source
        x = self.pool4(x)                  # [N, 512, 19, 19]

        x = F.relu(self.conv5_1(x))        # [N, 512, 19, 19]
        x = F.relu(self.conv5_2(x))        # [N, 512, 19, 19]
        x = F.relu(self.conv5_3(x))        # [N, 512, 19, 19]
        x = self.pool5(x)                  # [N, 512, 19, 19], size unchanged

        x = F.relu(self.conv6(x))          # [N, 1024, 19, 19]
        conv7_feats = F.relu(self.conv7(x))  # [N, 1024, 19, 19]

        return conv4_3_feats, conv7_feats
122
+
123
+
124
class AuxiliraryConvolutions(nn.Module):
    """Extra feature layers appended after the VGG-16 base network.

    Progressively shrinks the conv7 map (19x19) down to 1x1, producing the
    multi-scale feature maps consumed by the feature pyramid.
    """

    def __init__(self):
        super().__init__()

        self.conv8_1 = nn.Conv2d(1024, 256, kernel_size=1, padding=0)
        self.conv8_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)

        self.conv9_1 = nn.Conv2d(512, 128, kernel_size=1, padding=0)
        self.conv9_2 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)

        self.conv10_1 = nn.Conv2d(256, 128, kernel_size=1, padding=0)
        self.conv10_2 = nn.Conv2d(128, 256, kernel_size=3, padding=0)

        self.conv11_1 = nn.Conv2d(256, 128, kernel_size=1, padding=0)
        self.conv11_2 = nn.Conv2d(128, 256, kernel_size=3, padding=0)

    def init_conv2d(self):
        """Xavier-initialise every conv weight; zero every conv bias."""
        for child in self.children():
            if isinstance(child, nn.Conv2d):
                nn.init.xavier_uniform_(child.weight)
                if child.bias is not None:
                    nn.init.constant_(child.bias, 0.)

    def forward(self, conv7_feats):
        """
        :param conv7_feats: tensor [N, 1024, 19, 19]
        :return: feature maps at 10x10, 5x5, 3x3 and 1x1 resolution
        """
        x = F.relu(self.conv8_1(conv7_feats))       # [N, 256, 19, 19]
        conv8_2_feats = F.relu(self.conv8_2(x))     # [N, 512, 10, 10]

        x = F.relu(self.conv9_1(conv8_2_feats))     # [N, 128, 10, 10]
        conv9_2_feats = F.relu(self.conv9_2(x))     # [N, 256, 5, 5]

        x = F.relu(self.conv10_1(conv9_2_feats))    # [N, 128, 5, 5]
        conv10_2_feats = F.relu(self.conv10_2(x))   # [N, 256, 3, 3]

        x = F.relu(self.conv11_1(conv10_2_feats))   # [N, 128, 3, 3]
        conv11_2_feats = F.relu(self.conv11_2(x))   # [N, 256, 1, 1]

        return conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats
176
+
177
class FPNConvolutions(nn.Module):
    """Top-down feature-pyramid pathway over the SSD feature maps.

    Each stage upsamples the coarser map, projects it with a 1x1 conv, adds
    the lateral map, and batch-norms the result.

    Expected input shapes:
        conv4_3_feats  : [N,  512, 38, 38]
        conv7_feats    : [N, 1024, 19, 19]
        conv8_2_feats  : [N,  512, 10, 10]
        conv9_2_feats  : [N,  256,  5,  5]
        conv10_2_feats : [N,  256,  3,  3]
        conv11_2_feats : [N,  256,  1,  1]
    """

    def __init__(self):
        super().__init__()

        self.fp5_upsample = nn.Upsample(scale_factor=3, mode="bilinear")
        self.fp5_conv1 = nn.Conv2d(256, 256, kernel_size=1, bias=False)
        self.fp5_bn = nn.BatchNorm2d(256)

        # 3 -> 5 (floor(3 * 5/3) = 5)
        self.fp4_upsample = nn.Upsample(scale_factor=5 / 3, mode="bilinear")
        self.fp4_conv1 = nn.Conv2d(256, 256, kernel_size=1, bias=False)
        self.fp4_bn = nn.BatchNorm2d(256)

        self.fp3_upsample = nn.Upsample(scale_factor=2, mode="bilinear")
        self.fp3_conv1 = nn.Conv2d(256, 512, kernel_size=1, bias=False)
        self.fp3_bn = nn.BatchNorm2d(512)

        # 10 -> 19 (floor(10 * 1.9) = 19)
        self.fp2_upsample = nn.Upsample(scale_factor=1.9, mode="bilinear")
        self.fp2_conv1 = nn.Conv2d(512, 1024, kernel_size=1, bias=False)
        self.fp2_bn = nn.BatchNorm2d(1024)

        self.fp1_upsample = nn.Upsample(scale_factor=2, mode="bilinear")
        self.fp1_conv1 = nn.Conv2d(1024, 512, kernel_size=1, bias=False)
        self.fp1_bn = nn.BatchNorm2d(512)

    def init_conv2d(self):
        """Xavier-initialise every conv weight; zero every conv bias."""
        for child in self.children():
            if isinstance(child, nn.Conv2d):
                nn.init.xavier_uniform_(child.weight)
                if child.bias is not None:
                    nn.init.constant_(child.bias, 0.)

    def forward(self, conv4_3_feats, conv7_feats, conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats):
        # The coarsest level passes through untouched.
        fp6_feats = conv11_2_feats

        # NOTE: the pre-BN sum (not the BN output) feeds the next stage.
        x = self.fp5_upsample(conv11_2_feats)                   # 1x1 -> 3x3
        x = F.relu(F.relu(self.fp5_conv1(x)) + conv10_2_feats)
        fp5_feats = self.fp5_bn(x)

        x = self.fp4_upsample(x)                                # 3x3 -> 5x5
        x = F.relu(F.relu(self.fp4_conv1(x)) + conv9_2_feats)
        fp4_feats = self.fp4_bn(x)

        x = self.fp3_upsample(x)                                # 5x5 -> 10x10
        x = F.relu(F.relu(self.fp3_conv1(x)) + conv8_2_feats)
        fp3_feats = self.fp3_bn(x)

        x = self.fp2_upsample(x)                                # 10x10 -> 19x19
        x = F.relu(F.relu(self.fp2_conv1(x)) + conv7_feats)
        fp2_feats = self.fp2_bn(x)

        x = self.fp1_upsample(x)                                # 19x19 -> 38x38
        x = F.relu(F.relu(self.fp1_conv1(x)) + conv4_3_feats)
        fp1_feats = self.fp1_bn(x)

        return fp1_feats, fp2_feats, fp3_feats, fp4_feats, fp5_feats, fp6_feats
247
+
248
class PredictionConvolutions(nn.Module):
    """Heads that predict box offsets and class scores from the pyramid maps.

    :param n_classes: number of object classes including background (default
        21 for Pascal VOC).
    """

    def __init__(self, n_classes=21):
        super().__init__()

        self.n_classes = n_classes

        # Priors per spatial location of each pyramid level.
        n_boxes = {
            'fp1': 4,
            'fp2': 6,
            'fp3': 6,
            'fp4': 6,
            'fp5': 4,
            'fp6': 4,
        }

        # 3x3 convs with padding 1 keep the spatial size of each map.
        self.loc_fp6 = nn.Conv2d(256, n_boxes['fp6'] * 4, kernel_size=3, padding=1)
        self.loc_fp5 = nn.Conv2d(256, n_boxes['fp5'] * 4, kernel_size=3, padding=1)
        self.loc_fp4 = nn.Conv2d(256, n_boxes['fp4'] * 4, kernel_size=3, padding=1)
        self.loc_fp3 = nn.Conv2d(512, n_boxes['fp3'] * 4, kernel_size=3, padding=1)
        self.loc_fp2 = nn.Conv2d(1024, n_boxes['fp2'] * 4, kernel_size=3, padding=1)
        self.loc_fp1 = nn.Conv2d(512, n_boxes['fp1'] * 4, kernel_size=3, padding=1)

        self.conf_fp6 = nn.Conv2d(256, n_boxes['fp6'] * n_classes, kernel_size=3, padding=1)
        self.conf_fp5 = nn.Conv2d(256, n_boxes['fp5'] * n_classes, kernel_size=3, padding=1)
        self.conf_fp4 = nn.Conv2d(256, n_boxes['fp4'] * n_classes, kernel_size=3, padding=1)
        self.conf_fp3 = nn.Conv2d(512, n_boxes['fp3'] * n_classes, kernel_size=3, padding=1)
        self.conf_fp2 = nn.Conv2d(1024, n_boxes['fp2'] * n_classes, kernel_size=3, padding=1)
        self.conf_fp1 = nn.Conv2d(512, n_boxes['fp1'] * n_classes, kernel_size=3, padding=1)

    def init_conv2d(self):
        """Xavier-initialise every conv weight; zero every conv bias."""
        for child in self.children():
            if isinstance(child, nn.Conv2d):
                nn.init.xavier_uniform_(child.weight)
                if child.bias is not None:
                    nn.init.constant_(child.bias, 0.)

    def forward(self, fp1_feats, fp2_feats, fp3_feats, fp4_feats, fp5_feats, fp6_feats):
        """Return (loc [N, n_priors, 4], conf [N, n_priors, n_classes])."""
        batch_size = fp1_feats.shape[0]

        def flatten(t, last_dim):
            # [N, C, H, W] -> [N, H*W*boxes, last_dim]; channel-last ordering
            # keeps all priors of one location contiguous.
            return t.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, last_dim)

        feats = (fp1_feats, fp2_feats, fp3_feats, fp4_feats, fp5_feats, fp6_feats)
        loc_heads = (self.loc_fp1, self.loc_fp2, self.loc_fp3,
                     self.loc_fp4, self.loc_fp5, self.loc_fp6)
        conf_heads = (self.conf_fp1, self.conf_fp2, self.conf_fp3,
                      self.conf_fp4, self.conf_fp5, self.conf_fp6)

        loc = torch.cat([flatten(head(f), 4) for head, f in zip(loc_heads, feats)], dim=1)
        conf = torch.cat([flatten(head(f), self.n_classes) for head, f in zip(conf_heads, feats)], dim=1)

        return loc, conf
342
+
343
+
344
class L2Norm(nn.Module):
    """Channel-wise L2 normalisation with a learnable per-channel rescale.

    Applied to the conv4_3 feature map, whose activations have a larger
    scale than the deeper maps (as in the original SSD implementation).

    :param input_channel: number of channels to normalise over.
    :param scale: initial value for every rescale factor.
    """

    def __init__(self, input_channel, scale=20.):
        super().__init__()
        # One learnable factor per channel, initialised to `scale`.
        self.scale_factors = nn.Parameter(torch.FloatTensor(1, input_channel, 1, 1))
        self.eps = 1e-10  # guards against division by zero
        nn.init.constant_(self.scale_factors, scale)

    def forward(self, tensor):
        l2 = tensor.pow(2).sum(dim=1, keepdim=True).sqrt()
        return tensor / (l2 + self.eps) * self.scale_factors
355
+
356
class FPN_SSD300(nn.Module):
    """SSD300 detector with a feature-pyramid pathway over a VGG-16 base.

    :param pretrain_path: path to a saved state_dict; if None, the VGG base
        is initialised from ImageNet weights and all other convs with Xavier.
    :param data_train_on: "VOC" or "COCO"; selects the prior-box scales.
    :param n_classes: number of object classes including background.
    """

    def __init__(self, pretrain_path=None, data_train_on="VOC", n_classes=21):
        super().__init__()

        self.n_classes = n_classes
        self.data_train_on = data_train_on
        self.base_net = VGG16Base()
        self.auxi_conv = AuxiliraryConvolutions()
        self.fp_conv = FPNConvolutions()
        self.pred_conv = PredictionConvolutions(n_classes)
        self.l2_conv4_3 = L2Norm(input_channel=512)

        if pretrain_path is not None:
            # map_location keeps GPU-saved checkpoints loadable on CPU-only
            # hosts; load_state_dict copies onto the model's device anyway.
            self.load_state_dict(torch.load(pretrain_path, map_location="cpu"))
        else:
            self.base_net.load_pretrain()
            self.auxi_conv.init_conv2d()
            self.fp_conv.init_conv2d()
            self.pred_conv.init_conv2d()

    def create_prior_boxes(self):
        """Build the 8732 SSD300 prior (default) boxes.

        :return: tensor [8732, 4], each row [cx, cy, w, h] scaled to [0, 1].
        :raises ValueError: if `data_train_on` is neither "VOC" nor "COCO".
        """
        # Spatial size of each source feature map.
        fmap_sizes = [38, 19, 10, 5, 3, 1]

        # Scales as in the SSD paper, precomputed instead of using the formula.
        # conv4_3 is a special case (sect. 3.1, p.7):
        #   "We set default box with scale 0.1 on conv4_3 ..."
        #   "For SSD512 model, we add extra conv12_2 for prediction, set smin
        #    to 0.15, and 0.07 on conv4_3 ..."
        if self.data_train_on == "VOC":
            box_scales = [0.1, 0.2, 0.375, 0.55, 0.725, 0.9]
        elif self.data_train_on == "COCO":
            box_scales = [0.07, 0.15, 0.3375, 0.525, 0.7125, 0.9]
        else:
            # Previously an unknown dataset fell through to a NameError on
            # box_scales; fail loudly and explicitly instead.
            raise ValueError(
                f"unsupported data_train_on: {self.data_train_on!r} (expected 'VOC' or 'COCO')")

        aspect_ratios = [
            [1., 2., 0.5],
            [1., 2., 3., 0.5, 0.333],
            [1., 2., 3., 0.5, 0.333],
            [1., 2., 3., 0.5, 0.333],
            [1., 2., 0.5],
            [1., 2., 0.5],
        ]
        dboxes = []

        for idx, fmap_size in enumerate(fmap_sizes):
            for i in range(fmap_size):
                for j in range(fmap_size):

                    # cx runs along the horizontal axis, hence j + 0.5 (not i).
                    cx = (j + 0.5) / fmap_size
                    cy = (i + 0.5) / fmap_size

                    for aspect_ratio in aspect_ratios[idx]:
                        scale = box_scales[idx]
                        dboxes.append([cx, cy, scale * sqrt(aspect_ratio), scale / sqrt(aspect_ratio)])

                        # For ratio 1, add the extra prior whose scale is the
                        # geometric mean of this level's and the next level's.
                        if aspect_ratio == 1.:
                            try:
                                scale = sqrt(scale * box_scales[idx + 1])
                            except IndexError:
                                # Last feature map has no next scale.
                                scale = 1.
                            dboxes.append([cx, cy, scale * sqrt(aspect_ratio), scale / sqrt(aspect_ratio)])

        dboxes = torch.FloatTensor(dboxes)
        dboxes.clamp_(0, 1)  # keep every box inside the image

        return dboxes

    def forward(self, images):
        """
        :param images: tensor [N, 3, 300, 300]
        :return: (loc [N, 8732, 4], conf [N, 8732, n_classes])
        """
        conv4_3_feats, conv7_feats = self.base_net(images)
        conv4_3_feats = self.l2_conv4_3(conv4_3_feats)
        conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats = self.auxi_conv(conv7_feats)

        FP1_feats, FP2_feats, FP3_feats, FP4_feats, FP5_feats, FP6_feats = self.fp_conv(
            conv4_3_feats, conv7_feats, conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats)

        loc, conf = self.pred_conv(FP1_feats, FP2_feats, FP3_feats, FP4_feats, FP5_feats, FP6_feats)
        return loc, conf
442
+
443
+
444
+
445
+ if __name__ == "__main__":
446
+ T = FPN_SSD300()
447
+ img = torch.ones(1, 3, 300, 300)
448
+ loc, conf = T(img)
449
+ print(loc.shape)
450
+ print(conf.shape)
FPN_SSD300_b.py ADDED
@@ -0,0 +1,459 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils.lib import *
2
+
3
class VGG16Base(nn.Module):
    """VGG-16 backbone adapted for SSD300.

    Differences from the stock torchvision VGG-16:
      - Input is 300x300 instead of 224x224 (the shape comments below assume 300x300).
      - The 3rd pooling layer uses ceil_mode, so 75 -> 38 rather than 37.
      - The 5th pooling layer is 3x3 / stride 1 / padding 1 (keeps spatial size).
      - fc6/fc7 are converted to conv6/conv7 by decimating their weights; fc8 is dropped.
    """

    def __init__(self):
        super().__init__()

        self.conv1_1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)
        self.conv1_2 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv2_1 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.conv2_2 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv3_1 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.conv3_2 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.conv3_3 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        # ceil_mode keeps 75 -> 38 instead of 37.
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)

        self.conv4_1 = nn.Conv2d(256, 512, kernel_size=3, padding=1)
        self.conv4_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv4_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        # Stride-1 pooling: spatial size is preserved.
        self.pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)

        # conv6/conv7 replace the fully-connected layers; conv6 uses atrous
        # (dilated) convolution as in the SSD paper.
        self.conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)
        self.conv7 = nn.Conv2d(1024, 1024, kernel_size=1)

    def decimate(self, tensor, steps):
        """Keep every `steps[d]`-th slice along each dimension d (None keeps all)."""
        assert len(steps) == tensor.dim()

        for dim, step in enumerate(steps):
            if step is not None:
                keep = torch.arange(start=0, end=tensor.shape[dim], step=step)
                tensor = tensor.index_select(dim=dim, index=keep)

        return tensor

    def load_pretrain(self):
        """Copy ImageNet-pretrained VGG-16 weights into this module.

        Pretrained and local parameter names differ, but the parameters line
        up in architectural order, so they are matched positionally.  fc6/fc7
        are reshaped and decimated into conv6/conv7.
        """
        state_dict = self.state_dict()
        param_names = list(state_dict.keys())

        # New torchvision weights API (old style: torchvision.models.vgg16(pretrained=True)).
        # See https://pytorch.org/vision/stable/models.html
        pretrained = torchvision.models.vgg16(weights='VGG16_Weights.DEFAULT').state_dict()
        pretrained_names = list(pretrained.keys())

        # The last 4 local params are conv6/conv7 weight+bias, handled below.
        for idx, name in enumerate(param_names[:-4]):
            state_dict[name] = pretrained[pretrained_names[idx]]

        # fc -> conv reshape.
        fc6_weight = pretrained['classifier.0.weight'].view(4096, 512, 7, 7)
        fc6_bias = pretrained['classifier.0.bias'].view(4096)

        fc7_weight = pretrained['classifier.3.weight'].view(4096, 4096, 1, 1)
        fc7_bias = pretrained['classifier.3.bias'].view(4096)

        # Downsample (decimate) the fc parameters to the conv6/conv7 shapes.
        state_dict['conv6.weight'] = self.decimate(fc6_weight, steps=[4, None, 3, 3])
        state_dict['conv6.bias'] = self.decimate(fc6_bias, steps=[4])

        state_dict['conv7.weight'] = self.decimate(fc7_weight, steps=[4, 4, None, None])
        state_dict['conv7.bias'] = self.decimate(fc7_bias, steps=[4])

        self.load_state_dict(state_dict)

    def forward(self, images):
        """
        :param images: tensor [N, 3, 300, 300]
        :return: (conv4_3_feats [N, 512, 38, 38], conv7_feats [N, 1024, 19, 19])
        """
        x = F.relu(self.conv1_1(images))   # [N, 64, 300, 300]
        x = F.relu(self.conv1_2(x))        # [N, 64, 300, 300]
        x = self.pool1(x)                  # [N, 64, 150, 150]

        x = F.relu(self.conv2_1(x))        # [N, 128, 150, 150]
        x = F.relu(self.conv2_2(x))        # [N, 128, 150, 150]
        x = self.pool2(x)                  # [N, 128, 75, 75]

        x = F.relu(self.conv3_1(x))        # [N, 256, 75, 75]
        x = F.relu(self.conv3_2(x))        # [N, 256, 75, 75]
        x = F.relu(self.conv3_3(x))        # [N, 256, 75, 75]
        x = self.pool3(x)                  # [N, 256, 38, 38] thanks to ceil_mode

        x = F.relu(self.conv4_1(x))        # [N, 512, 38, 38]
        x = F.relu(self.conv4_2(x))        # [N, 512, 38, 38]
        x = F.relu(self.conv4_3(x))        # [N, 512, 38, 38]
        conv4_3_feats = x                  # first detection source
        x = self.pool4(x)                  # [N, 512, 19, 19]

        x = F.relu(self.conv5_1(x))        # [N, 512, 19, 19]
        x = F.relu(self.conv5_2(x))        # [N, 512, 19, 19]
        x = F.relu(self.conv5_3(x))        # [N, 512, 19, 19]
        x = self.pool5(x)                  # [N, 512, 19, 19], size unchanged

        x = F.relu(self.conv6(x))          # [N, 1024, 19, 19]
        conv7_feats = F.relu(self.conv7(x))  # [N, 1024, 19, 19]

        return conv4_3_feats, conv7_feats
122
+
123
+
124
class AuxiliraryConvolutions(nn.Module):
    """Extra feature layers appended after the VGG-16 base network.

    Progressively shrinks the conv7 map (19x19) down to 1x1, producing the
    multi-scale feature maps consumed by the feature pyramid.
    """

    def __init__(self):
        super().__init__()

        self.conv8_1 = nn.Conv2d(1024, 256, kernel_size=1, padding=0)
        self.conv8_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)

        self.conv9_1 = nn.Conv2d(512, 128, kernel_size=1, padding=0)
        self.conv9_2 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)

        self.conv10_1 = nn.Conv2d(256, 128, kernel_size=1, padding=0)
        self.conv10_2 = nn.Conv2d(128, 256, kernel_size=3, padding=0)

        self.conv11_1 = nn.Conv2d(256, 128, kernel_size=1, padding=0)
        self.conv11_2 = nn.Conv2d(128, 256, kernel_size=3, padding=0)

    def init_conv2d(self):
        """Xavier-initialise every conv weight; zero every conv bias."""
        for child in self.children():
            if isinstance(child, nn.Conv2d):
                nn.init.xavier_uniform_(child.weight)
                if child.bias is not None:
                    nn.init.constant_(child.bias, 0.)

    def forward(self, conv7_feats):
        """
        :param conv7_feats: tensor [N, 1024, 19, 19]
        :return: feature maps at 10x10, 5x5, 3x3 and 1x1 resolution
        """
        x = F.relu(self.conv8_1(conv7_feats))       # [N, 256, 19, 19]
        conv8_2_feats = F.relu(self.conv8_2(x))     # [N, 512, 10, 10]

        x = F.relu(self.conv9_1(conv8_2_feats))     # [N, 128, 10, 10]
        conv9_2_feats = F.relu(self.conv9_2(x))     # [N, 256, 5, 5]

        x = F.relu(self.conv10_1(conv9_2_feats))    # [N, 128, 5, 5]
        conv10_2_feats = F.relu(self.conv10_2(x))   # [N, 256, 3, 3]

        x = F.relu(self.conv11_1(conv10_2_feats))   # [N, 128, 3, 3]
        conv11_2_feats = F.relu(self.conv11_2(x))   # [N, 256, 1, 1]

        return conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats
176
+
177
class FPNConvolutions(nn.Module):
    """Top-down feature-pyramid pathway (variant with 3x3 refinement convs).

    Each stage: 1x1-conv + BN the lateral map down to 256 channels, add the
    upsampled top-down signal, then refine the sum with a 3x3 conv + BN.

    Expected input shapes:
        conv4_3_feats  : [N,  512, 38, 38]
        conv7_feats    : [N, 1024, 19, 19]
        conv8_2_feats  : [N,  512, 10, 10]
        conv9_2_feats  : [N,  256,  5,  5]
        conv10_2_feats : [N,  256,  3,  3]
        conv11_2_feats : [N,  256,  1,  1]
    """

    def __init__(self):
        super().__init__()

        self.fp5_upsample = nn.Upsample(scale_factor=3, mode="bilinear")
        self.fp5_conv1 = nn.Conv2d(256, 256, kernel_size=1)
        self.fp5_conv2 = nn.Conv2d(256, 256, kernel_size=3, padding=1, bias=False)
        self.fp5_bn1 = nn.BatchNorm2d(256)
        self.fp5_bn2 = nn.BatchNorm2d(256)

        # 3 -> 5 (floor(3 * 5/3) = 5)
        self.fp4_upsample = nn.Upsample(scale_factor=5 / 3, mode="bilinear")
        self.fp4_conv1 = nn.Conv2d(256, 256, kernel_size=1)
        self.fp4_conv2 = nn.Conv2d(256, 256, kernel_size=3, padding=1, bias=False)
        self.fp4_bn1 = nn.BatchNorm2d(256)
        self.fp4_bn2 = nn.BatchNorm2d(256)

        self.fp3_upsample = nn.Upsample(scale_factor=2, mode="bilinear")
        self.fp3_conv1 = nn.Conv2d(512, 256, kernel_size=1)
        self.fp3_conv2 = nn.Conv2d(256, 256, kernel_size=3, padding=1, bias=False)
        self.fp3_bn1 = nn.BatchNorm2d(256)
        self.fp3_bn2 = nn.BatchNorm2d(256)

        # 10 -> 19 (floor(10 * 1.9) = 19)
        self.fp2_upsample = nn.Upsample(scale_factor=1.9, mode="bilinear")
        self.fp2_conv1 = nn.Conv2d(1024, 256, kernel_size=1)
        self.fp2_conv2 = nn.Conv2d(256, 256, kernel_size=3, padding=1, bias=False)
        self.fp2_bn1 = nn.BatchNorm2d(256)
        self.fp2_bn2 = nn.BatchNorm2d(256)

        self.fp1_upsample = nn.Upsample(scale_factor=2, mode="bilinear")
        self.fp1_conv1 = nn.Conv2d(512, 256, kernel_size=1)
        self.fp1_conv2 = nn.Conv2d(256, 256, kernel_size=3, padding=1, bias=False)
        self.fp1_bn1 = nn.BatchNorm2d(256)
        self.fp1_bn2 = nn.BatchNorm2d(256)

    def init_conv2d(self):
        """Xavier-initialise every conv weight; zero every conv bias."""
        for child in self.children():
            if isinstance(child, nn.Conv2d):
                nn.init.xavier_uniform_(child.weight)
                if child.bias is not None:
                    nn.init.constant_(child.bias, 0.)

    def forward(self, conv4_3_feats, conv7_feats, conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats):
        # The coarsest level passes through untouched.
        fp6_feats = conv11_2_feats

        # NOTE: the pre-refinement sum (not the fpX output) feeds the next stage.
        x = self.fp5_upsample(conv11_2_feats)  # 1x1 -> 3x3
        x = F.relu(x + self.fp5_bn1(F.relu(self.fp5_conv1(conv10_2_feats))))
        fp5_feats = self.fp5_bn2(F.relu(self.fp5_conv2(x)))

        x = self.fp4_upsample(x)               # 3x3 -> 5x5
        x = F.relu(x + self.fp4_bn1(F.relu(self.fp4_conv1(conv9_2_feats))))
        fp4_feats = self.fp4_bn2(F.relu(self.fp4_conv2(x)))

        x = self.fp3_upsample(x)               # 5x5 -> 10x10
        x = F.relu(x + self.fp3_bn1(F.relu(self.fp3_conv1(conv8_2_feats))))
        fp3_feats = self.fp3_bn2(F.relu(self.fp3_conv2(x)))

        x = self.fp2_upsample(x)               # 10x10 -> 19x19
        x = F.relu(x + self.fp2_bn1(F.relu(self.fp2_conv1(conv7_feats))))
        fp2_feats = self.fp2_bn2(F.relu(self.fp2_conv2(x)))

        x = self.fp1_upsample(x)               # 19x19 -> 38x38
        x = F.relu(x + self.fp1_bn1(F.relu(self.fp1_conv1(conv4_3_feats))))
        fp1_feats = self.fp1_bn2(F.relu(self.fp1_conv2(x)))

        return fp1_feats, fp2_feats, fp3_feats, fp4_feats, fp5_feats, fp6_feats
257
+
258
class PredictionConvolutions(nn.Module):
    """Prediction heads: one 3x3 conv per pyramid level for box offsets
    (4 values per prior) and one for class scores (n_classes per prior).

    Every pyramid level fed to this module carries 256 channels.
    """

    def __init__(self, n_classes=21):
        """
        :param n_classes: number of object classes, background included.
        """
        super().__init__()

        self.n_classes = n_classes

        # priors per feature-map cell at each pyramid level
        n_boxes = {
            'fp1': 4,
            'fp2': 6,
            'fp3': 6,
            'fp4': 6,
            'fp5': 4,
            'fp6': 4
        }

        # kernel_size=3 with padding=1 keeps the spatial size unchanged

        self.loc_fp6 = nn.Conv2d(256, n_boxes['fp6']*4, kernel_size=3, padding=1)
        self.loc_fp5 = nn.Conv2d(256, n_boxes['fp5']*4, kernel_size=3, padding=1)
        self.loc_fp4 = nn.Conv2d(256, n_boxes['fp4']*4, kernel_size=3, padding=1)
        self.loc_fp3 = nn.Conv2d(256, n_boxes['fp3']*4, kernel_size=3, padding=1)
        self.loc_fp2 = nn.Conv2d(256, n_boxes['fp2']*4, kernel_size=3, padding=1)
        self.loc_fp1 = nn.Conv2d(256, n_boxes['fp1']*4, kernel_size=3, padding=1)

        self.conf_fp6 = nn.Conv2d(256, n_boxes['fp6']*n_classes, kernel_size=3, padding=1)
        self.conf_fp5 = nn.Conv2d(256, n_boxes['fp5']*n_classes, kernel_size=3, padding=1)
        self.conf_fp4 = nn.Conv2d(256, n_boxes['fp4']*n_classes, kernel_size=3, padding=1)
        self.conf_fp3 = nn.Conv2d(256, n_boxes['fp3']*n_classes, kernel_size=3, padding=1)
        self.conf_fp2 = nn.Conv2d(256, n_boxes['fp2']*n_classes, kernel_size=3, padding=1)
        self.conf_fp1 = nn.Conv2d(256, n_boxes['fp1']*n_classes, kernel_size=3, padding=1)

    def init_conv2d(self):
        """
        Initialize convolution parameters (Xavier weights, zero bias).
        """
        for c in self.children():
            if isinstance(c, nn.Conv2d):
                nn.init.xavier_uniform_(c.weight)
                if c.bias is not None:
                    nn.init.constant_(c.bias, 0.)

    @staticmethod
    def _flatten(pred, last_dim):
        """Reshape one head output [N, boxes*last_dim, H, W] into
        [N, H*W*boxes, last_dim] so all levels can be concatenated."""
        batch_size = pred.shape[0]
        return pred.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, last_dim)

    def forward(self, fp1_feats, fp2_feats, fp3_feats, fp4_feats, fp5_feats, fp6_feats):
        """Return (loc, conf): [N, n_priors, 4] box offsets and
        [N, n_priors, n_classes] raw class scores, levels ordered fp1..fp6."""
        feats = (fp1_feats, fp2_feats, fp3_feats, fp4_feats, fp5_feats, fp6_feats)

        # the repeated conv -> permute -> view boilerplate is factored into _flatten
        loc = torch.cat(
            [self._flatten(getattr(self, 'loc_fp%d' % (i + 1))(f), 4)
             for i, f in enumerate(feats)], dim=1)
        conf = torch.cat(
            [self._flatten(getattr(self, 'conf_fp%d' % (i + 1))(f), self.n_classes)
             for i, f in enumerate(feats)], dim=1)

        return loc, conf
352
+
353
+
354
class L2Norm(nn.Module):
    """L2-normalise a feature map across its channel dimension, then rescale
    each channel by a learnable factor (initialised to `scale`)."""

    def __init__(self, input_channel, scale=20.):
        super().__init__()
        self.scale_factors = nn.Parameter(torch.FloatTensor(1, input_channel, 1, 1))
        self.eps = 1e-10  # guards against division by zero
        nn.init.constant_(self.scale_factors, scale)

    def forward(self, tensor):
        channel_norm = tensor.pow(2).sum(dim=1, keepdim=True).sqrt()
        return tensor / (channel_norm + self.eps) * self.scale_factors
365
+
366
class FPN_SSD300(nn.Module):
    """SSD300 with an FPN neck: VGG16 base -> auxiliary convs -> top-down
    pyramid -> shared prediction heads.

    :param pretrain_path: optional path to a saved state_dict for the whole
        model; when None, the VGG base loads torchvision ImageNet weights and
        the remaining modules are freshly initialised.
    :param data_train_on: "VOC" or "COCO"; selects the prior-box scales.
    :param n_classes: number of classes, background included.
    """

    def __init__(self, pretrain_path = None, data_train_on = "VOC", n_classes = 21):
        super().__init__()

        self.n_classes = n_classes
        self.data_train_on = data_train_on
        self.base_net = VGG16Base()
        self.auxi_conv = AuxiliraryConvolutions()
        self.fp_conv = FPNConvolutions()
        self.pred_conv = PredictionConvolutions(n_classes)
        self.l2_conv4_3 = L2Norm(input_channel=512)

        if pretrain_path is not None:
            self.load_state_dict(torch.load(pretrain_path))
        else:
            self.base_net.load_pretrain()
            self.auxi_conv.init_conv2d()
            self.fp_conv.init_conv2d()
            self.pred_conv.init_conv2d()

    def create_prior_boxes(self):
        """
        Return every default (prior) box as a [8732, 4] tensor of scaled
        [cx, cy, w, h], clamped to [0, 1].

        :raises ValueError: if `data_train_on` is neither "VOC" nor "COCO".
        """
        # feature-map sizes of fp1..fp6
        fmap_sizes = [38, 19, 10, 5, 3, 1]

        # Scales follow the SSD paper, precomputed instead of the formula.
        # conv4_3 is treated as a special case (section 3.1, page 7):
        # "We set default box with scale 0.1 on conv4_3 ...",
        # "For SSD512 ... set smin to 0.15, and 0.07 on conv4_3 ..."
        if self.data_train_on == "VOC":
            box_scales = [0.1, 0.2, 0.375, 0.55, 0.725, 0.9]
        elif self.data_train_on == "COCO":
            box_scales = [0.07, 0.15, 0.3375, 0.525, 0.7125, 0.9]
        else:
            # previously this fell through and crashed later with a NameError
            raise ValueError("data_train_on must be 'VOC' or 'COCO', got %r" % (self.data_train_on,))

        aspect_ratios = [
            [1., 2., 0.5],
            [1., 2., 3., 0.5, 0.333],
            [1., 2., 3., 0.5, 0.333],
            [1., 2., 3., 0.5, 0.333],
            [1., 2., 0.5],
            [1., 2., 0.5]
        ]
        dboxes = []

        for idx, fmap_size in enumerate(fmap_sizes):
            for i in range(fmap_size):
                for j in range(fmap_size):

                    # cx runs along the horizontal axis, hence j (not i)
                    cx = (j + 0.5) / fmap_size
                    cy = (i + 0.5) / fmap_size

                    for aspect_ratio in aspect_ratios[idx]:
                        scale = box_scales[idx]
                        dboxes.append([cx, cy, scale*sqrt(aspect_ratio), scale/sqrt(aspect_ratio)])

                        # SSD adds one extra square prior at the geometric
                        # mean of this level's scale and the next level's
                        if aspect_ratio == 1.:
                            try:
                                scale = sqrt(scale*box_scales[idx + 1])
                            except IndexError:  # coarsest level has no next scale
                                scale = 1.
                            dboxes.append([cx, cy, scale*sqrt(aspect_ratio), scale/sqrt(aspect_ratio)])

        dboxes = torch.FloatTensor(dboxes)
        dboxes.clamp_(0, 1)

        return dboxes

    def forward(self, images):
        """images: [N, 3, 300, 300] -> (loc [N, 8732, 4], conf [N, 8732, n_classes])."""
        conv4_3_feats, conv7_feats = self.base_net(images)
        conv4_3_feats = self.l2_conv4_3(conv4_3_feats)  # rescale large conv4_3 activations
        conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats = self.auxi_conv(conv7_feats)

        FP1_feats, FP2_feats, FP3_feats, FP4_feats, FP5_feats, FP6_feats = self.fp_conv(
            conv4_3_feats, conv7_feats, conv8_2_feats,
            conv9_2_feats, conv10_2_feats, conv11_2_feats)

        loc, conf = self.pred_conv(FP1_feats, FP2_feats, FP3_feats, FP4_feats, FP5_feats, FP6_feats)
        return loc, conf
452
+
453
+
454
+
455
if __name__ == "__main__":
    # Smoke test.  BUG FIX: `T` was used without ever being defined, which
    # crashed with a NameError — the model must be instantiated first.
    T = FPN_SSD300()
    img = torch.ones(1, 3, 300, 300)
    loc, conf = T(img)
    print(loc.shape)
    print(conf.shape)
FPN_SSD300_c.py ADDED
@@ -0,0 +1,467 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils.lib import *
2
+
3
class VGG16Base(nn.Module):
    """
    VGG16 backbone with the standard SSD modifications:
    - Input images are 300x300 instead of 224x224 (the shape comments below assume 300x300)
    - The 3rd pooling layer uses ceiling mode instead of floor mode
    - The 5th pooling layer: kernel size (2, 2) -> (3, 3), stride 2 -> 1, padding = 1
    - fc6 and fc7 are downsampled (decimated) into conv6 and conv7; fc8 is dropped entirely
    """

    def __init__(self):
        super().__init__()

        self.conv1_1 = nn.Conv2d(in_channels= 3, out_channels= 64, kernel_size=3, padding=1)
        self.conv1_2 = nn.Conv2d(in_channels= 64, out_channels= 64, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv2_1 = nn.Conv2d(in_channels= 64, out_channels=128, kernel_size=3, padding=1)
        self.conv2_2 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv3_1 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=1)
        self.conv3_2 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.conv3_3 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        # ceil_mode=True so 75 -> 38 (not 37)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)

        self.conv4_1 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, padding=1)
        self.conv4_2 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1)
        self.conv4_3 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1)
        self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv5_1 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1)
        self.conv5_2 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1)
        self.conv5_3 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1)
        # stride 1 + padding 1: keeps the 19x19 spatial size
        self.pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)

        # No fc layers anymore; conv6 and conv7 replace them
        # atrous (dilated) convolution
        self.conv6 = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, padding=6, dilation=6)
        self.conv7 = nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=1)

    def decimate(self, tensor, steps):
        """Subsample `tensor`: along every dimension i where steps[i] is not
        None, keep only every steps[i]-th entry.  Used to shrink the fc6/fc7
        weights down to conv6/conv7 size."""
        assert(len(steps) == tensor.dim())

        for i in range(tensor.dim()):
            if steps[i] is not None:
                tensor = tensor.index_select(dim=i, index=torch.arange(start=0, end=tensor.shape[i], step=steps[i]))

        return tensor


    def load_pretrain(self):
        """
        Load torchvision's pretrained VGG16 and decimate the fc parameters to fit conv6 and conv7.
        """

        state_dict = self.state_dict()
        param_names = list(state_dict.keys())

        # old version : torch.vision.models.vgg16(pretrain=True)
        # Loaded through the newer torchvision weights API, see: https://pytorch.org/vision/stable/models.html
        pretrain_state_dict = torchvision.models.vgg16(weights='VGG16_Weights.DEFAULT').state_dict()
        pretrain_param_names = list(pretrain_state_dict.keys())

        # Pretrained and custom parameter names differ; the transfer relies on
        # both state_dicts listing parameters in the same architectural order.
        for idx, param_name in enumerate(param_names[:-4]): # last 4 params are conv6/conv7 weight and bias, handled below
            state_dict[param_name] = pretrain_state_dict[pretrain_param_names[idx]]

        # fc -> conv
        fc6_weight = pretrain_state_dict['classifier.0.weight'].view(4096, 512, 7, 7)
        fc6_bias = pretrain_state_dict['classifier.0.bias'].view(4096)

        fc7_weight = pretrain_state_dict['classifier.3.weight'].view(4096, 4096, 1, 1)
        fc7_bias = pretrain_state_dict['classifier.3.bias'].view(4096)

        # downsample parameter
        state_dict['conv6.weight'] = self.decimate(fc6_weight, steps=[4, None, 3, 3])
        state_dict['conv6.bias'] = self.decimate(fc6_bias, steps=[4])

        state_dict['conv7.weight'] = self.decimate(fc7_weight, steps=[4, 4, None, None])
        state_dict['conv7.bias'] = self.decimate(fc7_bias, steps=[4])

        self.load_state_dict(state_dict)


    def forward(self, images):
        """
        :param images: tensor [N, 3, 300, 300]

        :return: conv3_3_feats [N, 256, 75, 75], conv4_3_feats [N, 512, 38, 38],
                 conv7_feats [N, 1024, 19, 19]
        """
        out = F.relu(self.conv1_1(images)) # [N, 64, 300, 300]
        out = F.relu(self.conv1_2(out)) # [N, 64, 300, 300]
        out = self.pool1(out) # [N, 64, 150, 150]

        out = F.relu(self.conv2_1(out)) # [N, 128, 150, 150]
        out = F.relu(self.conv2_2(out)) # [N, 128, 150, 150]
        out = self.pool2(out) # [N, 128, 75, 75]

        out = F.relu(self.conv3_1(out)) # [N, 256, 75, 75]
        out = F.relu(self.conv3_2(out)) # [N, 256, 75, 75]
        out = F.relu(self.conv3_3(out)) # [N, 256, 75, 75]
        conv3_3_feats = out
        out = self.pool3(out) # [N, 256, 38, 38] not [N, 256, 37, 37] because ceil_mode = True

        out = F.relu(self.conv4_1(out)) # [N, 512, 38, 38]
        out = F.relu(self.conv4_2(out)) # [N, 512, 38, 38]
        out = F.relu(self.conv4_3(out)) # [N, 512, 38, 38]
        conv4_3_feats = out # [N, 512, 38, 38]
        out = self.pool4(out) # [N, 512, 19, 19]

        out = F.relu(self.conv5_1(out)) # [N, 512, 19, 19]
        out = F.relu(self.conv5_2(out)) # [N, 512, 19, 19]
        out = F.relu(self.conv5_3(out)) # [N, 512, 19, 19]
        out = self.pool5(out) # [N, 512, 19, 19], this pooling layer does not change the feature-map size

        out = F.relu(self.conv6(out)) # [N, 1024, 19, 19]

        conv7_feats = F.relu(self.conv7(out)) # [N, 1024, 19, 19]

        return conv3_3_feats, conv4_3_feats, conv7_feats
123
+
124
+
125
class AuxiliraryConvolutions(nn.Module):
    """Extra convolutions appended after the VGG base.

    Starting from conv7 [N, 1024, 19, 19] they shrink the spatial size step
    by step, producing the coarse maps consumed by the FPN laterals.
    """

    def __init__(self):
        super().__init__()

        # each stage: 1x1 channel reduction, then a 3x3 downsampling conv
        self.conv8_1 = nn.Conv2d(in_channels=1024, out_channels=256, kernel_size=1, padding=0)
        self.conv8_2 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=2, padding=1)

        self.conv9_1 = nn.Conv2d(in_channels=512, out_channels=128, kernel_size=1, padding=0)
        self.conv9_2 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1)

        self.conv10_1 = nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1, padding=0)
        self.conv10_2 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=0)

        self.conv11_1 = nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1, padding=0)
        self.conv11_2 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=0)

    def init_conv2d(self):
        """Kaiming-initialise every Conv2d child and zero its bias."""
        convs = [m for m in self.children() if isinstance(m, nn.Conv2d)]
        for conv in convs:
            nn.init.kaiming_uniform_(conv.weight, nonlinearity='relu')
            if conv.bias is not None:
                nn.init.constant_(conv.bias, 0.)

    def forward(self, conv7_feats):
        """
        :param conv7_feats: tensor [N, 1024, 19, 19]
        """
        x = F.relu(self.conv8_1(conv7_feats))       # [N, 256, 19, 19]
        conv8_2_feats = F.relu(self.conv8_2(x))     # [N, 512, 10, 10]

        x = F.relu(self.conv9_1(conv8_2_feats))     # [N, 128, 10, 10]
        conv9_2_feats = F.relu(self.conv9_2(x))     # [N, 256, 5, 5]

        x = F.relu(self.conv10_1(conv9_2_feats))    # [N, 128, 5, 5]
        conv10_2_feats = F.relu(self.conv10_2(x))   # [N, 256, 3, 3]

        x = F.relu(self.conv11_1(conv10_2_feats))   # [N, 128, 3, 3]
        conv11_2_feats = F.relu(self.conv11_2(x))   # [N, 256, 1, 1]

        return conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats
176
+
177
class FPNConvolutions(nn.Module):
    """Top-down pyramid over seven backbone maps.

    Input shapes (for a 300x300 image):
        conv3_3_feats  : [N, 256, 75, 75]
        conv4_3_feats  : [N, 512, 38, 38]
        conv7_feats    : [N, 1024, 19, 19]
        conv8_2_feats  : [N, 512, 10, 10]
        conv9_2_feats  : [N, 256, 5, 5]
        conv10_2_feats : [N, 256, 3, 3]
        conv11_2_feats : [N, 256, 1, 1]
    """

    def __init__(self):
        super().__init__()

        # per level: an upsampler for the top-down path, conv1 applied to the
        # upsampled signal and conv2 applied to the lateral backbone map
        self.fp6_upsample = nn.Upsample(scale_factor=3, mode="bilinear")
        self.fp6_conv1 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=1)
        self.fp6_conv2 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=1)

        self.fp5_upsample = nn.Upsample(scale_factor=5/3, mode="bilinear")
        self.fp5_conv1 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=1)
        self.fp5_conv2 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=1)

        self.fp4_upsample = nn.Upsample(scale_factor=2, mode="bilinear")
        self.fp4_conv1 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=1)
        self.fp4_conv2 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=1)

        self.fp3_upsample = nn.Upsample(scale_factor=1.9, mode="bilinear")
        self.fp3_conv1 = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=1)
        self.fp3_conv2 = nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=1)

        self.fp2_upsample = nn.Upsample(scale_factor=2, mode="bilinear")
        self.fp2_conv1 = nn.Conv2d(in_channels=1024, out_channels=512, kernel_size=1)
        self.fp2_conv2 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=1)

        self.fp1_upsample = nn.Upsample(scale_factor=75/38, mode="bilinear")
        self.fp1_conv1 = nn.Conv2d(in_channels=512, out_channels=256, kernel_size=1)
        self.fp1_conv2 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=1)

    def init_conv2d(self):
        """Kaiming-initialise every Conv2d child and zero its bias."""
        for module in self.children():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_uniform_(module.weight, nonlinearity='relu')
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0.)

    def forward(self, conv3_3_feats, conv4_3_feats, conv7_feats, conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats):
        """Fuse coarse-to-fine and return fp1..fp7 ordered fine -> coarse
        (fp7 is conv11_2 passed through unchanged)."""
        fp7_feats = conv11_2_feats

        laterals = (conv10_2_feats, conv9_2_feats, conv8_2_feats,
                    conv7_feats, conv4_3_feats, conv3_3_feats)
        pyramid = []
        top_down = conv11_2_feats
        for level, lateral in zip((6, 5, 4, 3, 2, 1), laterals):
            top_down = getattr(self, 'fp%d_upsample' % level)(top_down)
            top_down = F.relu(getattr(self, 'fp%d_conv1' % level)(top_down)
                              + getattr(self, 'fp%d_conv2' % level)(lateral))
            pyramid.append(top_down)

        fp6_feats, fp5_feats, fp4_feats, fp3_feats, fp2_feats, fp1_feats = pyramid
        return fp1_feats, fp2_feats, fp3_feats, fp4_feats, fp5_feats, fp6_feats, fp7_feats
253
+
254
class PredictionConvolutions(nn.Module):
    """Prediction heads for the 7-level pyramid: one 3x3 conv per level for
    box offsets (4 per prior) and one for class scores (n_classes per prior).

    Channel widths per level: fp1/fp5/fp6/fp7 = 256, fp2/fp4 = 512, fp3 = 1024.
    """

    def __init__(self, n_classes=21):
        """
        :param n_classes: number of object classes, background included.
        """
        super().__init__()

        self.n_classes = n_classes

        # priors per feature-map cell at each pyramid level
        n_boxes = {
            'fp1': 4,
            'fp2': 4,
            'fp3': 6,
            'fp4': 6,
            'fp5': 6,
            'fp6': 4,
            'fp7': 4,
        }

        # kernel_size=3 with padding=1 keeps the spatial size unchanged

        self.loc_fp7 = nn.Conv2d(256, n_boxes['fp7']*4, kernel_size=3, padding=1)
        self.loc_fp6 = nn.Conv2d(256, n_boxes['fp6']*4, kernel_size=3, padding=1)
        self.loc_fp5 = nn.Conv2d(256, n_boxes['fp5']*4, kernel_size=3, padding=1)
        self.loc_fp4 = nn.Conv2d(512, n_boxes['fp4']*4, kernel_size=3, padding=1)
        self.loc_fp3 = nn.Conv2d(1024, n_boxes['fp3']*4, kernel_size=3, padding=1)
        self.loc_fp2 = nn.Conv2d(512, n_boxes['fp2']*4, kernel_size=3, padding=1)
        self.loc_fp1 = nn.Conv2d(256, n_boxes['fp1']*4, kernel_size=3, padding=1)

        self.conf_fp7 = nn.Conv2d(256, n_boxes['fp7']*n_classes, kernel_size=3, padding=1)
        self.conf_fp6 = nn.Conv2d(256, n_boxes['fp6']*n_classes, kernel_size=3, padding=1)
        self.conf_fp5 = nn.Conv2d(256, n_boxes['fp5']*n_classes, kernel_size=3, padding=1)
        self.conf_fp4 = nn.Conv2d(512, n_boxes['fp4']*n_classes, kernel_size=3, padding=1)
        self.conf_fp3 = nn.Conv2d(1024, n_boxes['fp3']*n_classes, kernel_size=3, padding=1)
        self.conf_fp2 = nn.Conv2d(512, n_boxes['fp2']*n_classes, kernel_size=3, padding=1)
        self.conf_fp1 = nn.Conv2d(256, n_boxes['fp1']*n_classes, kernel_size=3, padding=1)

    def init_conv2d(self):
        """
        Initialize convolution parameters (Kaiming weights, zero bias).
        """
        for c in self.children():
            if isinstance(c, nn.Conv2d):
                nn.init.kaiming_uniform_(c.weight, nonlinearity='relu')
                if c.bias is not None:
                    nn.init.constant_(c.bias, 0.)

    @staticmethod
    def _flatten(pred, last_dim):
        """Reshape one head output [N, boxes*last_dim, H, W] into
        [N, H*W*boxes, last_dim] so all levels can be concatenated."""
        batch_size = pred.shape[0]
        return pred.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, last_dim)

    def forward(self, fp1_feats, fp2_feats, fp3_feats, fp4_feats, fp5_feats, fp6_feats, fp7_feats):
        """Return (loc, conf): [N, n_priors, 4] box offsets and
        [N, n_priors, n_classes] raw class scores, levels ordered fp1..fp7."""
        feats = (fp1_feats, fp2_feats, fp3_feats, fp4_feats,
                 fp5_feats, fp6_feats, fp7_feats)

        # the repeated conv -> permute -> view boilerplate is factored into _flatten
        loc = torch.cat(
            [self._flatten(getattr(self, 'loc_fp%d' % (i + 1))(f), 4)
             for i, f in enumerate(feats)], dim=1)
        conf = torch.cat(
            [self._flatten(getattr(self, 'conf_fp%d' % (i + 1))(f), self.n_classes)
             for i, f in enumerate(feats)], dim=1)

        return loc, conf
356
+
357
+
358
class L2Norm(nn.Module):
    """L2-normalise a feature map across its channel dimension, then rescale
    each channel by a learnable factor (initialised to `scale`)."""

    def __init__(self, input_channel, scale=20):
        super().__init__()
        self.scale_factors = nn.Parameter(torch.FloatTensor(1, input_channel, 1, 1))
        self.eps = 1e-10  # guards against division by zero
        nn.init.constant_(self.scale_factors, scale)

    def forward(self, tensor):
        channel_norm = tensor.pow(2).sum(dim=1, keepdim=True).sqrt()
        return tensor / (channel_norm + self.eps) * self.scale_factors
369
+
370
class FPN_SSD300(nn.Module):
    """SSD300 with a 7-level FPN neck (variant "c"): VGG16 base (conv3_3,
    conv4_3, conv7) -> auxiliary convs -> top-down pyramid -> prediction heads.

    :param pretrain_path: optional path to a saved state_dict for the whole
        model; when None, the VGG base loads torchvision ImageNet weights and
        the other modules are freshly initialised.
    :param data_train_on: "VOC" or "COCO"; selects the prior-box scales.
    :param n_classes: number of classes, background included.
    """

    def __init__(self, pretrain_path = None, data_train_on = "VOC", n_classes = 21):
        super().__init__()

        self.n_classes = n_classes
        self.data_train_on = data_train_on
        self.base_net = VGG16Base()
        self.auxi_conv = AuxiliraryConvolutions()
        self.fp_conv = FPNConvolutions()
        self.pred_conv = PredictionConvolutions(n_classes)
        self.l2_conv3_3 = L2Norm(input_channel=256)
        self.l2_conv4_3 = L2Norm(input_channel=512)

        if pretrain_path is not None:
            self.load_state_dict(torch.load(pretrain_path))
        else:
            self.base_net.load_pretrain()
            self.auxi_conv.init_conv2d()
            self.fp_conv.init_conv2d()
            self.pred_conv.init_conv2d()

    def create_prior_boxes(self):
        """
        Return every default (prior) box as a [31232, 4] tensor of scaled
        [cx, cy, w, h].

        :raises ValueError: if `data_train_on` is neither "VOC" nor "COCO".
        """
        # feature-map sizes of fp1..fp7
        fmap_sizes = [75, 38, 19, 10, 5, 3, 1]

        # Scales follow the SSD paper, precomputed instead of the formula.
        # The finest levels are treated as special cases (section 3.1, page 7):
        # "We set default box with scale 0.1 on conv4_3 ...",
        # "For SSD512 ... set smin to 0.15, and 0.07 on conv4_3 ..."
        if self.data_train_on == "VOC":
            box_scales = [0.1, 0.15, 0.2, 0.375, 0.55, 0.725, 0.9]
        elif self.data_train_on == "COCO":
            box_scales = [0.07, 0.11, 0.15, 0.3375, 0.525, 0.7125, 0.9]
        else:
            # previously this fell through and crashed later with a NameError
            raise ValueError("data_train_on must be 'VOC' or 'COCO', got %r" % (self.data_train_on,))

        aspect_ratios = [
            [1., 2., 0.5],
            [1., 2., 0.5],
            [1., 2., 3., 0.5, 0.333],
            [1., 2., 3., 0.5, 0.333],
            [1., 2., 3., 0.5, 0.333],
            [1., 2., 0.5],
            [1., 2., 0.5]
        ]
        dboxes = []

        for idx, fmap_size in enumerate(fmap_sizes):
            for i in range(fmap_size):
                for j in range(fmap_size):

                    # cx runs along the horizontal axis, hence j (not i)
                    cx = (j + 0.5) / fmap_size
                    cy = (i + 0.5) / fmap_size

                    for aspect_ratio in aspect_ratios[idx]:
                        scale = box_scales[idx]
                        dboxes.append([cx, cy, scale*sqrt(aspect_ratio), scale/sqrt(aspect_ratio)])

                        # SSD adds one extra square prior at the geometric
                        # mean of this level's scale and the next level's
                        if aspect_ratio == 1.:
                            try:
                                scale = sqrt(scale*box_scales[idx + 1])
                            except IndexError:  # coarsest level has no next scale
                                scale = 1.
                            dboxes.append([cx, cy, scale*sqrt(aspect_ratio), scale/sqrt(aspect_ratio)])

        dboxes = torch.FloatTensor(dboxes)

        # NOTE: unlike the "a" variant, the boxes are not clamped to [0, 1]
        # here (the clamp was disabled upstream).
        return dboxes

    def forward(self, images):
        """images: [N, 3, 300, 300] -> (loc [N, 31232, 4], conf [N, 31232, n_classes])."""
        conv3_3_feats, conv4_3_feats, conv7_feats = self.base_net(images)
        conv3_3_feats = self.l2_conv3_3(conv3_3_feats)  # rescale large early-layer activations
        conv4_3_feats = self.l2_conv4_3(conv4_3_feats)
        conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats = self.auxi_conv(conv7_feats)

        FP1_feats, FP2_feats, FP3_feats, FP4_feats, FP5_feats, FP6_feats, FP7_feats = self.fp_conv(
            conv3_3_feats, conv4_3_feats, conv7_feats, conv8_2_feats,
            conv9_2_feats, conv10_2_feats, conv11_2_feats)

        loc, conf = self.pred_conv(FP1_feats, FP2_feats, FP3_feats, FP4_feats,
                                   FP5_feats, FP6_feats, FP7_feats)
        return loc, conf
458
+ return loc, conf
459
+
460
+
461
+
462
if __name__ == "__main__":
    # Smoke test: push a constant image through a freshly built model and
    # report the prediction tensor shapes.
    T = FPN_SSD300()
    dummy = torch.ones(1, 3, 300, 300)
    loc, conf = T(dummy)
    print(loc.shape)
    print(conf.shape)
FPN_SSD512.py ADDED
@@ -0,0 +1,480 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils.lib import *
2
+
3
class VGG16Base(nn.Module):
    """
    VGG16 backbone with the standard SSD modifications:
    - Input images are 512x512 instead of 224x224 (the shape comments below assume 512x512)
    - The 3rd pooling layer uses ceiling mode instead of floor mode
    - The 5th pooling layer: kernel size (2, 2) -> (3, 3), stride 2 -> 1, padding = 1
    - fc6 and fc7 are downsampled (decimated) into conv6 and conv7; fc8 is dropped entirely
    """

    def __init__(self):
        super().__init__()

        self.conv1_1 = nn.Conv2d(in_channels= 3, out_channels= 64, kernel_size=3, padding=1)
        self.conv1_2 = nn.Conv2d(in_channels= 64, out_channels= 64, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv2_1 = nn.Conv2d(in_channels= 64, out_channels=128, kernel_size=3, padding=1)
        self.conv2_2 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv3_1 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=1)
        self.conv3_2 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.conv3_3 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        # ceil_mode=True, matching the SSD recipe (needed for odd input sizes)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)

        self.conv4_1 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, padding=1)
        self.conv4_2 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1)
        self.conv4_3 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1)
        self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv5_1 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1)
        self.conv5_2 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1)
        self.conv5_3 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1)
        # stride 1 + padding 1: keeps the 32x32 spatial size
        self.pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)

        # No fc layers anymore; conv6 and conv7 replace them
        # atrous (dilated) convolution
        self.conv6 = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, padding=6, dilation=6)
        self.conv7 = nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=1)

    def decimate(self, tensor, steps):
        """Subsample `tensor`: along every dimension i where steps[i] is not
        None, keep only every steps[i]-th entry.  Used to shrink the fc6/fc7
        weights down to conv6/conv7 size."""
        assert(len(steps) == tensor.dim())

        for i in range(tensor.dim()):
            if steps[i] is not None:
                tensor = tensor.index_select(dim=i, index=torch.arange(start=0, end=tensor.shape[i], step=steps[i]))

        return tensor


    def load_pretrain(self):
        """
        Load torchvision's pretrained VGG16 and decimate the fc parameters to fit conv6 and conv7.
        """

        state_dict = self.state_dict()
        param_names = list(state_dict.keys())

        # old version : torch.vision.models.vgg16(pretrain=True)
        # Loaded through the newer torchvision weights API, see: https://pytorch.org/vision/stable/models.html
        pretrain_state_dict = torchvision.models.vgg16(weights='VGG16_Weights.DEFAULT').state_dict()
        pretrain_param_names = list(pretrain_state_dict.keys())

        # Pretrained and custom parameter names differ; the transfer relies on
        # both state_dicts listing parameters in the same architectural order.
        for idx, param_name in enumerate(param_names[:-4]): # last 4 params are conv6/conv7 weight and bias, handled below
            state_dict[param_name] = pretrain_state_dict[pretrain_param_names[idx]]

        # fc -> conv
        fc6_weight = pretrain_state_dict['classifier.0.weight'].view(4096, 512, 7, 7)
        fc6_bias = pretrain_state_dict['classifier.0.bias'].view(4096)

        fc7_weight = pretrain_state_dict['classifier.3.weight'].view(4096, 4096, 1, 1)
        fc7_bias = pretrain_state_dict['classifier.3.bias'].view(4096)

        # downsample parameter
        state_dict['conv6.weight'] = self.decimate(fc6_weight, steps=[4, None, 3, 3])
        state_dict['conv6.bias'] = self.decimate(fc6_bias, steps=[4])

        state_dict['conv7.weight'] = self.decimate(fc7_weight, steps=[4, 4, None, None])
        state_dict['conv7.bias'] = self.decimate(fc7_bias, steps=[4])

        self.load_state_dict(state_dict)


    def forward(self, images):
        """
        :param images: tensor [N, 3, 512, 512]

        :return: conv4_3_feats [N, 512, 64, 64], conv7_feats [N, 1024, 32, 32]
        """
        out = F.relu(self.conv1_1(images)) # [N, 64, 512, 512]
        out = F.relu(self.conv1_2(out)) # [N, 64, 512, 512]
        out = self.pool1(out) # [N, 64, 256, 256]

        out = F.relu(self.conv2_1(out)) # [N, 128, 256, 256]
        out = F.relu(self.conv2_2(out)) # [N, 128, 256, 256]
        out = self.pool2(out) # [N, 128, 128, 128]

        out = F.relu(self.conv3_1(out)) # [N, 256, 128, 128]
        out = F.relu(self.conv3_2(out)) # [N, 256, 128, 128]
        out = F.relu(self.conv3_3(out)) # [N, 256, 128, 128]
        out = self.pool3(out) # [N, 256, 64, 64]

        out = F.relu(self.conv4_1(out)) # [N, 512, 64, 64]
        out = F.relu(self.conv4_2(out)) # [N, 512, 64, 64]
        out = F.relu(self.conv4_3(out)) # [N, 512, 64, 64]
        conv4_3_feats = out # [N, 512, 64, 64]
        out = self.pool4(out) # [N, 512, 32, 32]

        out = F.relu(self.conv5_1(out)) # [N, 512, 32, 32]
        out = F.relu(self.conv5_2(out)) # [N, 512, 32, 32]
        out = F.relu(self.conv5_3(out)) # [N, 512, 32, 32]
        out = self.pool5(out) # [N, 512, 32, 32], this pooling layer does not change the feature-map size

        out = F.relu(self.conv6(out)) # [N, 1024, 32, 32]

        conv7_feats = F.relu(self.conv7(out)) # [N, 1024, 32, 32]

        return conv4_3_feats, conv7_feats # [N, 512, 64, 64], [N, 1024, 32, 32]
122
+
123
+
124
class AuxiliraryConvolutions(nn.Module):
    """Extra feature layers appended after the VGG base (SSD512 variant).

    Input is conv7_feats [N, 1024, 32, 32]; each conv pair halves the spatial
    size, and the final conv12_2 (kernel 4, padding 1 on a 2x2 map) produces
    a 1x1 map.  (Class name spelling kept for checkpoint compatibility.)
    """

    def __init__(self):
        super().__init__()

        self.conv8_1 = nn.Conv2d(1024, 256, kernel_size=1, padding=0)
        self.conv8_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)

        self.conv9_1 = nn.Conv2d(512, 128, kernel_size=1, padding=0)
        self.conv9_2 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)

        self.conv10_1 = nn.Conv2d(256, 128, kernel_size=1, padding=0)
        self.conv10_2 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)

        self.conv11_1 = nn.Conv2d(256, 128, kernel_size=1, padding=0)
        self.conv11_2 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)

        self.conv12_1 = nn.Conv2d(256, 128, kernel_size=1, padding=0)
        self.conv12_2 = nn.Conv2d(128, 256, kernel_size=4, padding=1)

    def init_conv2d(self):
        """Xavier-initialize every conv weight; zero every conv bias."""
        for child in self.children():
            if isinstance(child, nn.Conv2d):
                nn.init.xavier_uniform_(child.weight)
                if child.bias is not None:
                    nn.init.constant_(child.bias, 0.)

    def forward(self, conv7_feats):
        """
        :param conv7_feats: tensor [N, 1024, 32, 32]
        :return: five feature maps, [N, 512, 16, 16] down to [N, 256, 1, 1]
        """
        stages = [
            (self.conv8_1, self.conv8_2),
            (self.conv9_1, self.conv9_2),
            (self.conv10_1, self.conv10_2),
            (self.conv11_1, self.conv11_2),
            (self.conv12_1, self.conv12_2),
        ]

        feats = []
        out = conv7_feats
        for reduce_conv, down_conv in stages:
            out = F.relu(reduce_conv(out))  # 1x1 channel reduction
            out = F.relu(down_conv(out))    # 3x3/4x4 spatial downsampling
            feats.append(out)

        return tuple(feats)
181
+
182
+
183
class FPNConvolutions(nn.Module):
    """Top-down feature-pyramid path over the SSD512 feature maps.

    Expected input shapes:
        conv4_3_feats  : [N, 512, 64, 64]
        conv7_feats    : [N, 1024, 32, 32]
        conv8_2_feats  : [N, 512, 16, 16]
        conv9_2_feats  : [N, 256, 8, 8]
        conv10_2_feats : [N, 256, 4, 4]
        conv11_2_feats : [N, 256, 2, 2]
        conv12_2_feats : [N, 256, 1, 1]
    """

    def __init__(self):
        super().__init__()

        # Attribute names are kept as-is so existing checkpoints still load.
        self.fp6_upsample = nn.Upsample(scale_factor=2, mode="bilinear")
        self.fp6_conv1 = nn.Conv2d(256, 256, kernel_size=1, bias=False)
        self.fp6_bn = nn.BatchNorm2d(256)

        self.fp5_upsample = nn.Upsample(scale_factor=2, mode="bilinear")
        self.fp5_conv1 = nn.Conv2d(256, 256, kernel_size=1, bias=False)
        self.fp5_bn = nn.BatchNorm2d(256)

        self.fp4_upsample = nn.Upsample(scale_factor=2, mode="bilinear")
        self.fp4_conv1 = nn.Conv2d(256, 256, kernel_size=1, bias=False)
        self.fp4_bn = nn.BatchNorm2d(256)

        self.fp3_upsample = nn.Upsample(scale_factor=2, mode="bilinear")
        self.fp3_conv1 = nn.Conv2d(256, 512, kernel_size=1, bias=False)
        self.fp3_bn = nn.BatchNorm2d(512)

        self.fp2_upsample = nn.Upsample(scale_factor=2, mode="bilinear")
        self.fp2_conv1 = nn.Conv2d(512, 1024, kernel_size=1, bias=False)
        self.fp2_bn = nn.BatchNorm2d(1024)

        self.fp1_upsample = nn.Upsample(scale_factor=2, mode="bilinear")
        self.fp1_conv1 = nn.Conv2d(1024, 512, kernel_size=1, bias=False)
        self.fp1_bn = nn.BatchNorm2d(512)

    def init_conv2d(self):
        """Xavier-initialize every conv weight; zero every conv bias."""
        for child in self.children():
            if isinstance(child, nn.Conv2d):
                nn.init.xavier_uniform_(child.weight)
                if child.bias is not None:
                    nn.init.constant_(child.bias, 0.)

    def forward(self, conv4_3_feats, conv7_feats, conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats, conv12_2_feats):
        """Return the pyramid levels (fp1 .. fp7), finest first."""
        fp7_feats = conv12_2_feats  # top level is passed through untouched

        # Each stage: upsample the running map, 1x1-project channels, add the
        # lateral connection, ReLU, then BatchNorm for this level's output.
        # NOTE: the *un-normalized* sum (not the BN output) feeds the next
        # stage — this mirrors the original data flow exactly.
        laterals = [conv11_2_feats, conv10_2_feats, conv9_2_feats,
                    conv8_2_feats, conv7_feats, conv4_3_feats]
        stages = [
            (self.fp6_upsample, self.fp6_conv1, self.fp6_bn),
            (self.fp5_upsample, self.fp5_conv1, self.fp5_bn),
            (self.fp4_upsample, self.fp4_conv1, self.fp4_bn),
            (self.fp3_upsample, self.fp3_conv1, self.fp3_bn),
            (self.fp2_upsample, self.fp2_conv1, self.fp2_bn),
            (self.fp1_upsample, self.fp1_conv1, self.fp1_bn),
        ]

        pyramid = []
        out = conv12_2_feats
        for (upsample, project, bn), lateral in zip(stages, laterals):
            out = F.relu(project(upsample(out)) + lateral)
            pyramid.append(bn(out))

        fp6_feats, fp5_feats, fp4_feats, fp3_feats, fp2_feats, fp1_feats = pyramid
        return fp1_feats, fp2_feats, fp3_feats, fp4_feats, fp5_feats, fp6_feats, fp7_feats
267
+
268
+
269
class PredictionConvolutions(nn.Module):
    """Localization and class-confidence heads over the 7 pyramid levels.

    Each level gets a 3x3 (padding 1, size-preserving) conv predicting
    `n_boxes * 4` box offsets and `n_boxes * n_classes` class scores.
    """

    def __init__(self, n_classes=21):
        super().__init__()

        self.n_classes = n_classes

        # Number of prior boxes per spatial cell at each pyramid level.
        n_boxes = {'fp1': 4, 'fp2': 6, 'fp3': 6, 'fp4': 6,
                   'fp5': 6, 'fp6': 4, 'fp7': 4}

        self.loc_fp1 = nn.Conv2d(512, n_boxes['fp1'] * 4, kernel_size=3, padding=1)
        self.loc_fp2 = nn.Conv2d(1024, n_boxes['fp2'] * 4, kernel_size=3, padding=1)
        self.loc_fp3 = nn.Conv2d(512, n_boxes['fp3'] * 4, kernel_size=3, padding=1)
        self.loc_fp4 = nn.Conv2d(256, n_boxes['fp4'] * 4, kernel_size=3, padding=1)
        self.loc_fp5 = nn.Conv2d(256, n_boxes['fp5'] * 4, kernel_size=3, padding=1)
        self.loc_fp6 = nn.Conv2d(256, n_boxes['fp6'] * 4, kernel_size=3, padding=1)
        self.loc_fp7 = nn.Conv2d(256, n_boxes['fp7'] * 4, kernel_size=3, padding=1)

        self.conf_fp1 = nn.Conv2d(512, n_boxes['fp1'] * n_classes, kernel_size=3, padding=1)
        self.conf_fp2 = nn.Conv2d(1024, n_boxes['fp2'] * n_classes, kernel_size=3, padding=1)
        self.conf_fp3 = nn.Conv2d(512, n_boxes['fp3'] * n_classes, kernel_size=3, padding=1)
        self.conf_fp4 = nn.Conv2d(256, n_boxes['fp4'] * n_classes, kernel_size=3, padding=1)
        self.conf_fp5 = nn.Conv2d(256, n_boxes['fp5'] * n_classes, kernel_size=3, padding=1)
        self.conf_fp6 = nn.Conv2d(256, n_boxes['fp6'] * n_classes, kernel_size=3, padding=1)
        self.conf_fp7 = nn.Conv2d(256, n_boxes['fp7'] * n_classes, kernel_size=3, padding=1)

    def init_conv2d(self):
        """Xavier-initialize every conv weight; zero every conv bias."""
        for child in self.children():
            if isinstance(child, nn.Conv2d):
                nn.init.xavier_uniform_(child.weight)
                if child.bias is not None:
                    nn.init.constant_(child.bias, 0.)

    def forward(self, fp1_feats, fp2_feats, fp3_feats, fp4_feats, fp5_feats, fp6_feats, fp7_feats):
        """Return (loc [N, n_priors, 4], conf [N, n_priors, n_classes])."""
        feats = [fp1_feats, fp2_feats, fp3_feats, fp4_feats,
                 fp5_feats, fp6_feats, fp7_feats]
        batch_size = fp1_feats.shape[0]

        def flatten(pred, last_dim):
            # [N, C, H, W] -> [N, H*W*n_boxes, last_dim], box order matching
            # the prior-box generation (row-major over cells, boxes innermost)
            return pred.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, last_dim)

        loc_heads = [self.loc_fp1, self.loc_fp2, self.loc_fp3, self.loc_fp4,
                     self.loc_fp5, self.loc_fp6, self.loc_fp7]
        conf_heads = [self.conf_fp1, self.conf_fp2, self.conf_fp3, self.conf_fp4,
                      self.conf_fp5, self.conf_fp6, self.conf_fp7]

        loc = torch.cat([flatten(head(f), 4) for head, f in zip(loc_heads, feats)], dim=1)
        conf = torch.cat([flatten(head(f), self.n_classes) for head, f in zip(conf_heads, feats)], dim=1)

        return loc, conf
370
+
371
class L2Norm(nn.Module):
    """Channel-wise L2 normalization with a learnable per-channel rescale.

    Applied to conv4_3, whose activations are much larger than those of the
    deeper layers (as in the original SSD implementation).
    """

    def __init__(self, input_channel=512, scale=20):
        super().__init__()
        self.scale_factors = nn.Parameter(torch.FloatTensor(1, input_channel, 1, 1))
        self.eps = 1e-10  # guards against division by zero
        nn.init.constant_(self.scale_factors, scale)

    def forward(self, tensor):
        """Normalize each position's channel vector to unit L2 norm, then rescale."""
        l2 = tensor.pow(2).sum(dim=1, keepdim=True).sqrt()
        return tensor / (l2 + self.eps) * self.scale_factors
382
+
383
class FPN_SSD512(nn.Module):
    """SSD512 detector with an FPN top-down path over the feature maps.

    :param pretrain_path: path to a saved state_dict; when None, the VGG base
                          is initialized from torchvision's ImageNet weights
                          and the remaining convs are Xavier-initialized
    :param data_train_on: dataset the prior-box scales are tuned for
                          ("VOC" or "COCO")
    :param n_classes: number of classes including background (21 for VOC)
    """

    def __init__(self, pretrain_path=None, data_train_on="VOC", n_classes=21):
        super().__init__()

        self.n_classes = n_classes
        self.data_train_on = data_train_on
        self.base_net = VGG16Base()
        self.auxi_conv = AuxiliraryConvolutions()
        self.pred_conv = PredictionConvolutions(n_classes)
        self.fp_conv = FPNConvolutions()
        self.l2_conv4_3 = L2Norm(input_channel=512)

        if pretrain_path is not None:
            self.load_state_dict(torch.load(pretrain_path))
        else:
            self.base_net.load_pretrain()
            self.auxi_conv.init_conv2d()
            self.fp_conv.init_conv2d()
            self.pred_conv.init_conv2d()

    def create_prior_boxes(self):
        """Create the 24564 prior boxes for SSD512, as in the paper.

        Each box is [cx, cy, w, h], normalized to [0, 1].

        Scales are the precomputed paper values rather than the formula.
        conv4_3 is a special case (SSD paper, section 3.1, page 7):
        "We set default box with scale 0.1 on conv4_3 ..." and
        "For SSD512 model, we add extra conv12_2 for prediction,
        set smin to 0.15, and 0.07 on conv4_3 ..."

        :raises ValueError: if ``self.data_train_on`` is not "VOC" or "COCO"
        """
        # spatial size of each pyramid level, finest first
        fmap_sizes = [64, 32, 16, 8, 4, 2, 1]

        if self.data_train_on == "VOC":
            box_scales = [0.07, 0.15, 0.3, 0.45, 0.6, 0.75, 0.9]
        elif self.data_train_on == "COCO":
            box_scales = [0.04, 0.1, 0.26, 0.42, 0.58, 0.74, 0.9]
        else:
            # Previously any other value fell through with box_scales
            # undefined, raising a confusing NameError below.
            raise ValueError(
                "data_train_on must be 'VOC' or 'COCO', got {!r}".format(self.data_train_on))

        aspect_ratios = [
            [1., 2., 0.5],
            [1., 2., 3., 0.5, 0.333],
            [1., 2., 3., 0.5, 0.333],
            [1., 2., 3., 0.5, 0.333],
            [1., 2., 3., 0.5, 0.333],
            [1., 2., 0.5],
            [1., 2., 0.5]
        ]
        dboxes = []

        for idx, fmap_size in enumerate(fmap_sizes):
            for i in range(fmap_size):
                for j in range(fmap_size):

                    # cx runs along the horizontal axis, hence j (the column)
                    cx = (j + 0.5) / fmap_size
                    cy = (i + 0.5) / fmap_size

                    for aspect_ratio in aspect_ratios[idx]:
                        scale = box_scales[idx]
                        dboxes.append([cx, cy, scale * sqrt(aspect_ratio), scale / sqrt(aspect_ratio)])

                        # extra square box with the geometric-mean scale
                        if aspect_ratio == 1.:
                            try:
                                scale = sqrt(scale * box_scales[idx + 1])
                            except IndexError:
                                scale = 1.  # coarsest level has no successor
                            dboxes.append([cx, cy, scale * sqrt(aspect_ratio), scale / sqrt(aspect_ratio)])

        dboxes = torch.FloatTensor(dboxes)

        # clip boxes that spill outside the image
        dboxes.clamp_(0, 1)

        return dboxes

    def forward(self, images):
        """
        :param images: tensor [N, 3, 512, 512]
        :return: (loc [N, 24564, 4], conf [N, 24564, n_classes])
        """
        conv4_3_feats, conv7_feats = self.base_net(images)
        conv4_3_feats = self.l2_conv4_3(conv4_3_feats)  # rescale conv4_3 activations
        conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats, conv12_2_feats = self.auxi_conv(conv7_feats)

        FP1_feats, FP2_feats, FP3_feats, FP4_feats, FP5_feats, FP6_feats, FP7_feats = self.fp_conv(
            conv4_3_feats, conv7_feats, conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats, conv12_2_feats)

        loc, conf = self.pred_conv(FP1_feats, FP2_feats, FP3_feats, FP4_feats, FP5_feats, FP6_feats, FP7_feats)
        return loc, conf
471
+
472
+
473
if __name__ == "__main__":
    # Smoke test: push one random batch through the network and check the
    # shapes of the prediction tensors.
    # NOTE: the old torch.Tensor(1, 3, 512, 512) allocated *uninitialized*
    # memory (which may contain NaN/inf); torch.randn gives defined input.
    model = FPN_SSD512()
    images = torch.randn(1, 3, 512, 512)
    loc, conf = model(images)
    print(loc.shape)
    print(conf.shape)
479
+
480
+
SSD300.py ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils.lib import *
2
+ from utils.box_utils import pascalVOC_style, yolo_style
3
+
4
class VGG16Base(nn.Module):
    """VGG16 backbone adapted for SSD300.

    Differences from the stock torchvision VGG16:
    - input is 300x300 instead of 224x224 (comments below assume 300x300)
    - pool3 uses ceil mode instead of floor mode
    - pool5 becomes kernel 3, stride 1, padding 1 (size-preserving)
    - fc6/fc7 are decimated into conv6/conv7; fc8 is dropped entirely
    """

    def __init__(self):
        super().__init__()

        self.conv1_1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)
        self.conv1_2 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv2_1 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.conv2_2 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv3_1 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.conv3_2 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.conv3_3 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)  # 75 -> 38, not 37

        self.conv4_1 = nn.Conv2d(256, 512, kernel_size=3, padding=1)
        self.conv4_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv4_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)  # size-preserving

        # conv6/conv7 replace the fc layers; conv6 uses atrous (dilated) conv
        self.conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)
        self.conv7 = nn.Conv2d(1024, 1024, kernel_size=1)

    def decimate(self, tensor, steps):
        """Keep every `steps[i]`-th entry of `tensor` along dimension i
        (``None`` leaves that dimension untouched)."""
        assert len(steps) == tensor.dim()

        for dim, step in enumerate(steps):
            if step is not None:
                keep = torch.arange(start=0, end=tensor.shape[dim], step=step)
                tensor = tensor.index_select(dim=dim, index=keep)

        return tensor

    def load_pretrain(self):
        """Copy ImageNet-pretrained VGG16 weights into this network.

        Parameters are matched by position (names differ), and the fc6/fc7
        classifier weights are reshaped and decimated into conv6/conv7.
        Downloads the weights on first use.
        """
        own_state = self.state_dict()
        own_names = list(own_state.keys())

        # Old API was torchvision.models.vgg16(pretrained=True); new weights
        # API documented at https://pytorch.org/vision/stable/models.html
        donor_state = torchvision.models.vgg16(weights='VGG16_Weights.DEFAULT').state_dict()
        donor_names = list(donor_state.keys())

        # Last 4 of our params are conv6/conv7 weight+bias (handled below);
        # everything before lines up one-to-one with the torchvision stack.
        for i, name in enumerate(own_names[:-4]):
            own_state[name] = donor_state[donor_names[i]]

        # Reshape the fully-connected classifier weights into conv kernels.
        fc6_weight = donor_state['classifier.0.weight'].view(4096, 512, 7, 7)
        fc6_bias = donor_state['classifier.0.bias'].view(4096)
        fc7_weight = donor_state['classifier.3.weight'].view(4096, 4096, 1, 1)
        fc7_bias = donor_state['classifier.3.bias'].view(4096)

        # Subsample to the conv6/conv7 shapes.
        own_state['conv6.weight'] = self.decimate(fc6_weight, steps=[4, None, 3, 3])
        own_state['conv6.bias'] = self.decimate(fc6_bias, steps=[4])
        own_state['conv7.weight'] = self.decimate(fc7_weight, steps=[4, 4, None, None])
        own_state['conv7.bias'] = self.decimate(fc7_bias, steps=[4])

        self.load_state_dict(own_state)

    def forward(self, images):
        """
        :param images: tensor [N, 3, 300, 300]
        :return: (conv4_3_feats [N, 512, 38, 38], conv7_feats [N, 1024, 19, 19])
        """
        x = F.relu(self.conv1_1(images))        # [N, 64, 300, 300]
        x = F.relu(self.conv1_2(x))             # [N, 64, 300, 300]
        x = self.pool1(x)                       # [N, 64, 150, 150]

        x = F.relu(self.conv2_1(x))             # [N, 128, 150, 150]
        x = F.relu(self.conv2_2(x))             # [N, 128, 150, 150]
        x = self.pool2(x)                       # [N, 128, 75, 75]

        x = F.relu(self.conv3_1(x))             # [N, 256, 75, 75]
        x = F.relu(self.conv3_2(x))             # [N, 256, 75, 75]
        x = F.relu(self.conv3_3(x))             # [N, 256, 75, 75]
        x = self.pool3(x)                       # [N, 256, 38, 38] (ceil mode)

        x = F.relu(self.conv4_1(x))             # [N, 512, 38, 38]
        x = F.relu(self.conv4_2(x))             # [N, 512, 38, 38]
        x = F.relu(self.conv4_3(x))             # [N, 512, 38, 38]
        conv4_3_feats = x                       # kept for the detection heads
        x = self.pool4(x)                       # [N, 512, 19, 19]

        x = F.relu(self.conv5_1(x))             # [N, 512, 19, 19]
        x = F.relu(self.conv5_2(x))             # [N, 512, 19, 19]
        x = F.relu(self.conv5_3(x))             # [N, 512, 19, 19]
        x = self.pool5(x)                       # stride-1 pool: stays 19x19

        x = F.relu(self.conv6(x))               # [N, 1024, 19, 19]
        conv7_feats = F.relu(self.conv7(x))     # [N, 1024, 19, 19]

        return conv4_3_feats, conv7_feats
123
+
124
+
125
class AuxiliraryConvolutions(nn.Module):
    """Extra feature layers appended after the VGG base (SSD300 variant).

    conv8/conv9 halve the spatial size with stride-2 convs; conv10/conv11
    shrink it with unpadded 3x3 convs, ending at a 1x1 map.
    (Class name spelling kept for checkpoint compatibility.)
    """

    def __init__(self):
        super().__init__()

        self.conv8_1 = nn.Conv2d(1024, 256, kernel_size=1, padding=0)
        self.conv8_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)

        self.conv9_1 = nn.Conv2d(512, 128, kernel_size=1, padding=0)
        self.conv9_2 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)

        self.conv10_1 = nn.Conv2d(256, 128, kernel_size=1, padding=0)
        self.conv10_2 = nn.Conv2d(128, 256, kernel_size=3, padding=0)

        self.conv11_1 = nn.Conv2d(256, 128, kernel_size=1, padding=0)
        self.conv11_2 = nn.Conv2d(128, 256, kernel_size=3, padding=0)

    def init_conv2d(self):
        """Xavier-initialize every conv weight; zero every conv bias."""
        for child in self.children():
            if isinstance(child, nn.Conv2d):
                nn.init.xavier_uniform_(child.weight)
                if child.bias is not None:
                    nn.init.constant_(child.bias, 0.)

    def forward(self, conv7_feats):
        """
        :param conv7_feats: tensor [N, 1024, 19, 19]
        :return: four feature maps, [N, 512, 10, 10] down to [N, 256, 1, 1]
        """
        out = F.relu(self.conv8_1(conv7_feats))     # [N, 256, 19, 19]
        conv8_2_feats = F.relu(self.conv8_2(out))   # [N, 512, 10, 10]

        out = F.relu(self.conv9_1(conv8_2_feats))   # [N, 128, 10, 10]
        conv9_2_feats = F.relu(self.conv9_2(out))   # [N, 256, 5, 5]

        out = F.relu(self.conv10_1(conv9_2_feats))  # [N, 128, 5, 5]
        conv10_2_feats = F.relu(self.conv10_2(out)) # [N, 256, 3, 3]

        out = F.relu(self.conv11_1(conv10_2_feats)) # [N, 128, 3, 3]
        conv11_2_feats = F.relu(self.conv11_2(out)) # [N, 256, 1, 1]

        return conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats
175
+
176
+
177
class PredictionConvolutions(nn.Module):
    """Localization and class-confidence heads over the 6 SSD300 feature maps.

    Each map gets a 3x3 (padding 1, size-preserving) conv predicting
    `n_boxes * 4` box offsets and `n_boxes * n_classes` class scores.
    """

    def __init__(self, n_classes=21):
        super().__init__()

        self.n_classes = n_classes

        # Number of prior boxes per spatial cell at each feature map.
        n_boxes = {'conv4_3': 4, 'conv7': 6, 'conv8_2': 6,
                   'conv9_2': 6, 'conv10_2': 4, 'conv11_2': 4}

        self.loc_conv4_3 = nn.Conv2d(512, n_boxes['conv4_3'] * 4, kernel_size=3, padding=1)
        self.loc_conv7 = nn.Conv2d(1024, n_boxes['conv7'] * 4, kernel_size=3, padding=1)
        self.loc_conv8_2 = nn.Conv2d(512, n_boxes['conv8_2'] * 4, kernel_size=3, padding=1)
        self.loc_conv9_2 = nn.Conv2d(256, n_boxes['conv9_2'] * 4, kernel_size=3, padding=1)
        self.loc_conv10_2 = nn.Conv2d(256, n_boxes['conv10_2'] * 4, kernel_size=3, padding=1)
        self.loc_conv11_2 = nn.Conv2d(256, n_boxes['conv11_2'] * 4, kernel_size=3, padding=1)

        self.conf_conv4_3 = nn.Conv2d(512, n_boxes['conv4_3'] * n_classes, kernel_size=3, padding=1)
        self.conf_conv7 = nn.Conv2d(1024, n_boxes['conv7'] * n_classes, kernel_size=3, padding=1)
        self.conf_conv8_2 = nn.Conv2d(512, n_boxes['conv8_2'] * n_classes, kernel_size=3, padding=1)
        self.conf_conv9_2 = nn.Conv2d(256, n_boxes['conv9_2'] * n_classes, kernel_size=3, padding=1)
        self.conf_conv10_2 = nn.Conv2d(256, n_boxes['conv10_2'] * n_classes, kernel_size=3, padding=1)
        self.conf_conv11_2 = nn.Conv2d(256, n_boxes['conv11_2'] * n_classes, kernel_size=3, padding=1)

    def init_conv2d(self):
        """Xavier-initialize every conv weight; zero every conv bias."""
        for child in self.children():
            if isinstance(child, nn.Conv2d):
                nn.init.xavier_uniform_(child.weight)
                if child.bias is not None:
                    nn.init.constant_(child.bias, 0.)

    def forward(self, conv4_3_feats, conv7_feats, conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats):
        """Return (loc [N, n_priors, 4], conf [N, n_priors, n_classes])."""
        feats = [conv4_3_feats, conv7_feats, conv8_2_feats,
                 conv9_2_feats, conv10_2_feats, conv11_2_feats]
        batch_size = conv4_3_feats.shape[0]

        def flatten(pred, last_dim):
            # [N, C, H, W] -> [N, H*W*n_boxes, last_dim], box order matching
            # the prior-box generation (row-major over cells, boxes innermost)
            return pred.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, last_dim)

        loc_heads = [self.loc_conv4_3, self.loc_conv7, self.loc_conv8_2,
                     self.loc_conv9_2, self.loc_conv10_2, self.loc_conv11_2]
        conf_heads = [self.conf_conv4_3, self.conf_conv7, self.conf_conv8_2,
                      self.conf_conv9_2, self.conf_conv10_2, self.conf_conv11_2]

        loc = torch.cat([flatten(head(f), 4) for head, f in zip(loc_heads, feats)], dim=1)
        conf = torch.cat([flatten(head(f), self.n_classes) for head, f in zip(conf_heads, feats)], dim=1)

        return loc, conf
270
+
271
class L2Norm(nn.Module):
    """Channel-wise L2 normalization with a learnable per-channel rescale.

    Applied to conv4_3, whose activations are much larger than those of the
    deeper layers (as in the original SSD implementation).
    """

    def __init__(self, input_channel=512, scale=20):
        super().__init__()
        self.scale_factors = nn.Parameter(torch.FloatTensor(1, input_channel, 1, 1))
        self.eps = 1e-10  # guards against division by zero
        nn.init.constant_(self.scale_factors, scale)

    def forward(self, tensor):
        """Normalize each position's channel vector to unit L2 norm, then rescale."""
        l2 = tensor.pow(2).sum(dim=1, keepdim=True).sqrt()
        return tensor / (l2 + self.eps) * self.scale_factors
282
+
283
class SSD300(nn.Module):
    """SSD300 single-shot detector (VGG16 base + auxiliary + prediction convs).

    :param pretrain_path: path to a saved state_dict; when None, the VGG base
                          is initialized from torchvision's ImageNet weights
                          and the remaining convs are Xavier-initialized
    :param data_train_on: dataset the prior-box scales are tuned for
                          ("VOC" or "COCO")
    :param n_classes: number of classes including background (21 for VOC)
    """

    def __init__(self, pretrain_path=None, data_train_on="VOC", n_classes=21):
        super().__init__()

        self.n_classes = n_classes
        self.data_train_on = data_train_on
        self.base_net = VGG16Base()
        self.auxi_conv = AuxiliraryConvolutions()
        self.pred_conv = PredictionConvolutions(n_classes)
        self.l2_norm = L2Norm()

        if pretrain_path is not None:
            self.load_state_dict(torch.load(pretrain_path))
        else:
            self.base_net.load_pretrain()
            self.auxi_conv.init_conv2d()
            self.pred_conv.init_conv2d()

    def create_prior_boxes(self):
        """Create the 8732 prior boxes for SSD300, as in the paper.

        Each box is [cx, cy, w, h], normalized to [0, 1].

        Scales are the precomputed paper values rather than the formula.
        conv4_3 is a special case (SSD paper, section 3.1, page 7):
        "We set default box with scale 0.1 on conv4_3 ...".

        :raises ValueError: if ``self.data_train_on`` is not "VOC" or "COCO"
        """
        # spatial size of each feature map used for prediction
        fmap_sizes = [38, 19, 10, 5, 3, 1]

        if self.data_train_on == "VOC":
            box_scales = [0.1, 0.2, 0.375, 0.55, 0.725, 0.9]
        elif self.data_train_on == "COCO":
            box_scales = [0.07, 0.15, 0.3375, 0.525, 0.7125, 0.9]
        else:
            # Previously any other value fell through with box_scales
            # undefined, raising a confusing NameError below.
            raise ValueError(
                "data_train_on must be 'VOC' or 'COCO', got {!r}".format(self.data_train_on))

        aspect_ratios = [
            [1., 2., 0.5],
            [1., 2., 3., 0.5, 0.333],
            [1., 2., 3., 0.5, 0.333],
            [1., 2., 3., 0.5, 0.333],
            [1., 2., 0.5],
            [1., 2., 0.5]
        ]
        dboxes = []

        for idx, fmap_size in enumerate(fmap_sizes):
            for i in range(fmap_size):
                for j in range(fmap_size):

                    # cx runs along the horizontal axis, hence j (the column)
                    cx = (j + 0.5) / fmap_size
                    cy = (i + 0.5) / fmap_size

                    for aspect_ratio in aspect_ratios[idx]:
                        scale = box_scales[idx]
                        dboxes.append([cx, cy, scale * sqrt(aspect_ratio), scale / sqrt(aspect_ratio)])

                        # extra square box with the geometric-mean scale
                        if aspect_ratio == 1:
                            try:
                                scale = sqrt(scale * box_scales[idx + 1])
                            except IndexError:
                                scale = 1.  # coarsest level has no successor
                            dboxes.append([cx, cy, scale * sqrt(aspect_ratio), scale / sqrt(aspect_ratio)])

        dboxes = torch.FloatTensor(dboxes)

        # clip boxes that spill outside the image
        dboxes.clamp_(min=0, max=1)

        return dboxes

    def forward(self, images):
        """
        :param images: tensor [N, 3, 300, 300]
        :return: (loc [N, 8732, 4], conf [N, 8732, n_classes])
        """
        conv4_3_feats, conv7_feats = self.base_net(images)
        conv4_3_feats = self.l2_norm(conv4_3_feats)  # rescale conv4_3 activations
        conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats = self.auxi_conv(conv7_feats)

        loc, conf = self.pred_conv(conv4_3_feats, conv7_feats, conv8_2_feats,
                                   conv9_2_feats, conv10_2_feats, conv11_2_feats)
        return loc, conf
366
+
367
+
368
+
SSD512.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils.lib import *
2
+
3
class VGG16Base(nn.Module):
    """
    VGG16 backbone adapted for SSD, with the following changes:
    - Input images are 512x512 instead of 224x224; the shape comments below assume 512x512
    - The 3rd pooling layer uses ceiling mode instead of floor mode
    - The 5th pooling layer: kernel size (2, 2) -> (3, 3), stride 2 -> 1, and padding = 1
    - fc6 and fc7 parameters are downsampled (decimated) to form conv6 and conv7; fc8 is dropped entirely
    """

    def __init__(self):
        super().__init__()

        self.conv1_1 = nn.Conv2d(in_channels= 3, out_channels= 64, kernel_size=3, padding=1)
        self.conv1_2 = nn.Conv2d(in_channels= 64, out_channels= 64, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv2_1 = nn.Conv2d(in_channels= 64, out_channels=128, kernel_size=3, padding=1)
        self.conv2_2 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv3_1 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=1)
        self.conv3_2 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.conv3_3 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        # ceil_mode=True so an odd-sized map is not truncated at this stage
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)

        self.conv4_1 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, padding=1)
        self.conv4_2 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1)
        self.conv4_3 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1)
        self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv5_1 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1)
        self.conv5_2 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1)
        self.conv5_3 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1)
        # stride-1 pooling: retains the spatial size going into conv6
        self.pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)

        # No fc layers any more; conv6 and conv7 take their place
        # atrous (dilated) convolution
        self.conv6 = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, padding=6, dilation=6)
        self.conv7 = nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=1)

    def decimate(self, tensor, steps):
        """
        Downsample `tensor` by keeping every `steps[i]`-th slice along dimension i.
        A step of None leaves that dimension untouched.
        """
        assert(len(steps) == tensor.dim())

        for i in range(tensor.dim()):
            if steps[i] is not None:
                tensor = tensor.index_select(dim=i, index=torch.arange(start=0, end=tensor.shape[i], step=steps[i]))

        return tensor


    def load_pretrain(self):
        """
        Load pretrained VGG16 weights from torchvision, decimating the fc6/fc7
        parameters so they fit conv6 and conv7.
        """

        state_dict = self.state_dict()
        param_names = list(state_dict.keys())

        # old version : torch.vision.models.vgg16(pretrain=True)
        # Load the model via the newer pytorch API, see: https://pytorch.org/vision/stable/models.html
        pretrain_state_dict = torchvision.models.vgg16(weights='VGG16_Weights.DEFAULT').state_dict()
        pretrain_param_names = list(pretrain_state_dict.keys())

        # Pretrained param names differ from the custom ones; the params only share
        # the same order as in the architecture, so copy by position.
        for idx, param_name in enumerate(param_names[:-4]): # last 4 params are conv6/conv7 weights and biases, handled below
            state_dict[param_name] = pretrain_state_dict[pretrain_param_names[idx]]

        # fc -> conv
        fc6_weight = pretrain_state_dict['classifier.0.weight'].view(4096, 512, 7, 7)
        fc6_bias = pretrain_state_dict['classifier.0.bias'].view(4096)

        fc7_weight = pretrain_state_dict['classifier.3.weight'].view(4096, 4096, 1, 1)
        fc7_bias = pretrain_state_dict['classifier.3.bias'].view(4096)

        # downsample parameters to the conv6/conv7 shapes
        state_dict['conv6.weight'] = self.decimate(fc6_weight, steps=[4, None, 3, 3])
        state_dict['conv6.bias'] = self.decimate(fc6_bias, steps=[4])

        state_dict['conv7.weight'] = self.decimate(fc7_weight, steps=[4, 4, None, None])
        state_dict['conv7.bias'] = self.decimate(fc7_bias, steps=[4])

        self.load_state_dict(state_dict)


    def forward(self, images):
        """
        :param images, tensor [N, 3, 512, 512]

        return: conv4_3 and conv7 feature maps
        """
        out = F.relu(self.conv1_1(images)) # [N, 64, 512, 512]
        out = F.relu(self.conv1_2(out)) # [N, 64, 512, 512]
        out = self.pool1(out) # [N, 64, 256, 256]

        out = F.relu(self.conv2_1(out)) # [N, 128, 256, 256]
        out = F.relu(self.conv2_2(out)) # [N, 128, 256, 256]
        out = self.pool2(out) # [N, 128, 128, 128]

        out = F.relu(self.conv3_1(out)) # [N, 256, 128, 128]
        out = F.relu(self.conv3_2(out)) # [N, 256, 128, 128]
        out = F.relu(self.conv3_3(out)) # [N, 256, 128, 128]
        out = self.pool3(out) # [N, 256, 64, 64]

        out = F.relu(self.conv4_1(out)) # [N, 512, 64, 64]
        out = F.relu(self.conv4_2(out)) # [N, 512, 64, 64]
        out = F.relu(self.conv4_3(out)) # [N, 512, 64, 64]
        conv4_3_feats = out # [N, 512, 64, 64]
        out = self.pool4(out) # [N, 512, 32, 32]

        out = F.relu(self.conv5_1(out)) # [N, 512, 32, 32]
        out = F.relu(self.conv5_2(out)) # [N, 512, 32, 32]
        out = F.relu(self.conv5_3(out)) # [N, 512, 32, 32]
        out = self.pool5(out) # [N, 512, 32, 32], this pooling layer does not change the feature-map size

        out = F.relu(self.conv6(out)) # [N, 1024, 32, 32]

        conv7_feats = F.relu(self.conv7(out)) # [N, 1024, 32, 32]

        return conv4_3_feats, conv7_feats # [N, 512, 64, 64], [N, 1024, 32, 32]
+
123
+
124
class AuxiliraryConvolutions(nn.Module):
    """
    Extra feature layers stacked on top of conv7, shrinking the map step by step:
    32x32 -> 16x16 -> 8x8 -> 4x4 -> 2x2 -> 1x1.
    """

    def __init__(self):
        super().__init__()

        # Each stage: a 1x1 channel-reduction conv followed by a spatial-reduction conv.
        self.conv8_1 = nn.Conv2d(1024, 256, kernel_size=1, padding=0)
        self.conv8_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)

        self.conv9_1 = nn.Conv2d(512, 128, kernel_size=1, padding=0)
        self.conv9_2 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)

        self.conv10_1 = nn.Conv2d(256, 128, kernel_size=1, padding=0)
        self.conv10_2 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)

        self.conv11_1 = nn.Conv2d(256, 128, kernel_size=1, padding=0)
        self.conv11_2 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)

        self.conv12_1 = nn.Conv2d(256, 128, kernel_size=1, padding=0)
        # 4x4 kernel with padding 1 maps a 2x2 input to 1x1
        self.conv12_2 = nn.Conv2d(128, 256, kernel_size=4, padding=1)

    def init_conv2d(self):
        """Xavier-initialise every conv weight and zero every bias."""
        for module in self.children():
            if not isinstance(module, nn.Conv2d):
                continue
            nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                nn.init.constant_(module.bias, 0.)

    def forward(self, conv7_feats):
        """
        :param conv7_feats: tensor [N, 1024, 32, 32]
        :return: the five feature maps conv8_2 ... conv12_2
        """
        x = F.relu(self.conv8_1(conv7_feats))       # [N, 256, 32, 32]
        conv8_2_feats = F.relu(self.conv8_2(x))     # [N, 512, 16, 16]

        x = F.relu(self.conv9_1(conv8_2_feats))    # [N, 128, 16, 16]
        conv9_2_feats = F.relu(self.conv9_2(x))     # [N, 256, 8, 8]

        x = F.relu(self.conv10_1(conv9_2_feats))   # [N, 128, 8, 8]
        conv10_2_feats = F.relu(self.conv10_2(x))   # [N, 256, 4, 4]

        x = F.relu(self.conv11_1(conv10_2_feats))  # [N, 128, 4, 4]
        conv11_2_feats = F.relu(self.conv11_2(x))   # [N, 256, 2, 2]

        x = F.relu(self.conv12_1(conv11_2_feats))  # [N, 128, 2, 2]
        conv12_2_feats = F.relu(self.conv12_2(x))   # [N, 256, 1, 1]

        return conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats, conv12_2_feats
+
182
+
183
class PredictionConvolutions(nn.Module):
    """
    Location and class prediction heads: one 3x3 conv pair (loc + conf) per
    feature map. Per-map outputs are flattened and concatenated into
    (N, n_priors, 4) box offsets and (N, n_priors, n_classes) class scores.
    """

    def __init__(self, n_classes=21):
        super().__init__()

        self.n_classes = n_classes

        # number of prior boxes per spatial position on each feature map
        boxes_per_fmap = {
            'conv4_3': 4,
            'conv7': 6,
            'conv8_2': 6,
            'conv9_2': 6,
            'conv10_2': 6,
            'conv11_2': 4,
            'conv12_2': 4,
        }

        # kernel_size=3 with padding=1 keeps the feature-map size unchanged

        self.loc_conv4_3 = nn.Conv2d(512, boxes_per_fmap['conv4_3'] * 4, kernel_size=3, padding=1)
        self.loc_conv7 = nn.Conv2d(1024, boxes_per_fmap['conv7'] * 4, kernel_size=3, padding=1)
        self.loc_conv8_2 = nn.Conv2d(512, boxes_per_fmap['conv8_2'] * 4, kernel_size=3, padding=1)
        self.loc_conv9_2 = nn.Conv2d(256, boxes_per_fmap['conv9_2'] * 4, kernel_size=3, padding=1)
        self.loc_conv10_2 = nn.Conv2d(256, boxes_per_fmap['conv10_2'] * 4, kernel_size=3, padding=1)
        self.loc_conv11_2 = nn.Conv2d(256, boxes_per_fmap['conv11_2'] * 4, kernel_size=3, padding=1)
        self.loc_conv12_2 = nn.Conv2d(256, boxes_per_fmap['conv12_2'] * 4, kernel_size=3, padding=1)

        self.conf_conv4_3 = nn.Conv2d(512, boxes_per_fmap['conv4_3'] * n_classes, kernel_size=3, padding=1)
        self.conf_conv7 = nn.Conv2d(1024, boxes_per_fmap['conv7'] * n_classes, kernel_size=3, padding=1)
        self.conf_conv8_2 = nn.Conv2d(512, boxes_per_fmap['conv8_2'] * n_classes, kernel_size=3, padding=1)
        self.conf_conv9_2 = nn.Conv2d(256, boxes_per_fmap['conv9_2'] * n_classes, kernel_size=3, padding=1)
        self.conf_conv10_2 = nn.Conv2d(256, boxes_per_fmap['conv10_2'] * n_classes, kernel_size=3, padding=1)
        self.conf_conv11_2 = nn.Conv2d(256, boxes_per_fmap['conv11_2'] * n_classes, kernel_size=3, padding=1)
        self.conf_conv12_2 = nn.Conv2d(256, boxes_per_fmap['conv12_2'] * n_classes, kernel_size=3, padding=1)

    def init_conv2d(self):
        """Xavier-initialise every conv weight and zero every bias."""
        for module in self.children():
            if not isinstance(module, nn.Conv2d):
                continue
            nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                nn.init.constant_(module.bias, 0.)

    def forward(self, conv4_3_feats, conv7_feats, conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats, conv12_2_feats):
        """
        :return: (loc, conf) — (N, n_priors, 4) offsets and (N, n_priors, n_classes) scores,
                 concatenated over the seven feature maps in prior-box order.
        """
        batch_size = conv4_3_feats.shape[0]

        feats = (conv4_3_feats, conv7_feats, conv8_2_feats, conv9_2_feats,
                 conv10_2_feats, conv11_2_feats, conv12_2_feats)
        loc_heads = (self.loc_conv4_3, self.loc_conv7, self.loc_conv8_2, self.loc_conv9_2,
                     self.loc_conv10_2, self.loc_conv11_2, self.loc_conv12_2)
        conf_heads = (self.conf_conv4_3, self.conf_conv7, self.conf_conv8_2, self.conf_conv9_2,
                      self.conf_conv10_2, self.conf_conv11_2, self.conf_conv12_2)

        def _flatten(out, channels_per_box):
            # (N, C, H, W) -> (N, H*W*boxes, channels_per_box), matching prior-box ordering
            return out.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, channels_per_box)

        loc = torch.cat([_flatten(head(f), 4) for head, f in zip(loc_heads, feats)], dim=1)
        conf = torch.cat([_flatten(head(f), self.n_classes) for head, f in zip(conf_heads, feats)], dim=1)

        return loc, conf
+
284
class L2Norm(nn.Module):
    """
    Channel-wise L2 normalisation with a learnable per-channel rescale,
    applied to the conv4_3 feature map (whose activations are larger in scale).
    """

    def __init__(self, input_channel=512, scale=20):
        super().__init__()
        # one learnable factor per channel, broadcast over batch and spatial dims
        self.scale_factors = nn.Parameter(torch.FloatTensor(1, input_channel, 1, 1))
        self.eps = 1e-10
        nn.init.constant_(self.scale_factors, scale)

    def forward(self, tensor):
        """Divide each spatial position by its channel L2 norm, then rescale."""
        l2 = tensor.pow(2).sum(dim=1, keepdim=True).sqrt()
        return tensor / (l2 + self.eps) * self.scale_factors
+
296
class SSD512(nn.Module):
    """
    SSD512 detector: VGG16 base + auxiliary convolutions + prediction heads.

    :param pretrain_path: path to a full-model state dict; if None, the VGG base
                          is loaded from torchvision and the extra convs are
                          Xavier-initialised
    :param data_train_on: "VOC" or "COCO" — selects the prior-box scales
    :param n_classes: number of classes including background (21 for Pascal VOC)
    """

    def __init__(self, pretrain_path = None, data_train_on = "VOC", n_classes = 21):
        super().__init__()

        self.n_classes = n_classes
        self.data_train_on = data_train_on
        self.base_net = VGG16Base()
        self.auxi_conv = AuxiliraryConvolutions()
        self.pred_conv = PredictionConvolutions(n_classes)
        self.l2_norm = L2Norm()

        if pretrain_path is not None:
            self.load_state_dict(torch.load(pretrain_path))
        else:
            self.base_net.load_pretrain()
            self.auxi_conv.init_conv2d()
            self.pred_conv.init_conv2d()

    def create_prior_boxes(self):
        """
        Create the SSD512 prior (default) boxes as in the paper.

        :return: tensor (24564, 4); each row is [cx, cy, w, h] normalised to [0, 1]
        """
        # feature-map sizes of the seven prediction layers
        fmap_sizes = [64, 32, 16, 8, 4, 2, 1]

        # scales precomputed as in the paper instead of using the formula;
        # conv4_3 is a special case — section 3.1, page 7:
        # "We set default box with scale 0.1 on conv4 3 .... "
        # "For SSD512 model, we add extra conv12 2 for prediction, set smin to 0.15, and 0.07 on conv4 3...""

        if self.data_train_on == "VOC":
            box_scales = [0.07, 0.15, 0.3, 0.45, 0.6, 0.75, 0.9]
        elif self.data_train_on == "COCO":
            box_scales = [0.04, 0.1, 0.26, 0.42, 0.58, 0.74, 0.9]
        else:
            # previously an unknown dataset fell through and crashed later
            # with a NameError on box_scales; fail loudly and early instead
            raise ValueError(
                "Unsupported data_train_on '%s', expected 'VOC' or 'COCO'" % self.data_train_on
            )

        aspect_ratios = [
            [1., 2., 0.5],
            [1., 2., 3., 0.5, 0.333],
            [1., 2., 3., 0.5, 0.333],
            [1., 2., 3., 0.5, 0.333],
            [1., 2., 3., 0.5, 0.333],
            [1., 2., 0.5],
            [1., 2., 0.5]
        ]
        dboxes = []

        for idx, fmap_size in enumerate(fmap_sizes):
            for i in range(fmap_size):
                for j in range(fmap_size):

                    # note: cx runs along the horizontal axis, hence j + 0.5, not i + 0.5
                    cx = (j + 0.5) / fmap_size
                    cy = (i + 0.5) / fmap_size

                    for aspect_ratio in aspect_ratios[idx]:
                        scale = box_scales[idx]
                        dboxes.append([cx, cy, scale*sqrt(aspect_ratio), scale/sqrt(aspect_ratio)])

                        # for ratio 1 the paper adds one extra box whose scale is the
                        # geometric mean of this layer's scale and the next layer's
                        if aspect_ratio == 1.:
                            try:
                                scale = sqrt(scale*box_scales[idx + 1])
                            except IndexError:
                                # last feature map has no successor scale
                                scale = 1.
                            dboxes.append([cx, cy, scale*sqrt(aspect_ratio), scale/sqrt(aspect_ratio)])

        dboxes = torch.FloatTensor(dboxes)

        #dboxes = pascalVOC_style(dboxes)
        dboxes.clamp_(0, 1)
        #dboxes = yolo_style(dboxes)

        return dboxes

    def forward(self, images):
        """
        :param images: tensor [N, 3, 512, 512]
        :return: loc (N, 24564, 4) offsets and conf (N, 24564, n_classes) scores
        """
        conv4_3_feats, conv7_feats = self.base_net(images)
        # rescale conv4_3 with the learned L2 norm before prediction
        conv4_3_feats = self.l2_norm(conv4_3_feats)
        conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats, conv12_2_feats = self.auxi_conv(conv7_feats)

        loc, conf = self.pred_conv(conv4_3_feats, conv7_feats, conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats, conv12_2_feats)
        return loc, conf
+
381
+
382
+
383
+ if __name__ == "__main__":
384
+ T = SSD512()
385
+ imgs = torch.Tensor(1, 3, 512, 512)
386
+ loc, conf = T(imgs)
387
+ print(loc.shape)
388
+ print(conf.shape)
389
+
390
+
iteration_118000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3be7c9bb7482a96847f489afe937753b907b213841dbe3f4c7417c697bc97d19
3
+ size 113201355
iteration_120000_FPNSSD300_78.01.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:364ff90e18d3a18b39bfd9f7c12f917ac491ecb38d1875b23becfbf3cbc4fc27
3
+ size 110460694
iteration_120000_SSD300.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39e11ecdf15827df398d5dd04b9a5a79372f0126a292f5cb2640d5b482d3a59a
3
+ size 105166689
iteration_120000_SSD300_77.2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efd82f52cac67f61d9c7ab5b2fc496ad6d107eb35c3c493a353d3240fe1b610d
3
+ size 105166689
iteration_120000_a_78.27.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:993c656917cf99ec7a49dc5cc81bcc846f87ad4ccc193174f4d76cbeea1cc632
3
+ size 110460694
iteration_120000_b_78.29.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a5f257f8964206b68842b5b02d51d1873e9e7eb03e4527ec11cdbc13349d11c
3
+ size 113201217
iteration_120000_c_78.01.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:364ff90e18d3a18b39bfd9f7c12f917ac491ecb38d1875b23becfbf3cbc4fc27
3
+ size 110460694