Darknsu committed on
Commit e1fb045 · verified · 1 Parent(s): a53e7ee

Upload 25 files

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ data/Poppins[[:space:]]Black[[:space:]]Italic[[:space:]]900.ttf filter=lfs diff=lfs merge=lfs -text
+ data/Poppins[[:space:]]ExtraBold[[:space:]]Italic[[:space:]]800.ttf filter=lfs diff=lfs merge=lfs -text
Evaluation/__pycache__/eval_detection_gentime.cpython-310.pyc ADDED
Binary file (7.8 kB).
 
Evaluation/__pycache__/utils.cpython-310.pyc ADDED
Binary file (2.49 kB).
 
Evaluation/eval_detection_gentime.py ADDED
@@ -0,0 +1,566 @@
+ import json
+ #import urllib.request, urllib.error, urllib.parse
+
+ import numpy as np
+ import pandas as pd
+
+ from utils import get_blocked_videos
+ from utils import interpolated_prec_rec
+ from utils import segment_iou
+
+ class ANETdetection(object):
+
+     GROUND_TRUTH_FIELDS = ['database']#, 'taxonomy', 'version']
+     PREDICTION_FIELDS = ['results', 'version', 'external_data']
+
+     def __init__(self, opt, ground_truth_filename=None, prediction_filename=None,
+                  ground_truth_fields=GROUND_TRUTH_FIELDS,
+                  prediction_fields=PREDICTION_FIELDS,
+                  tiou_thresholds=np.linspace(0.5, 0.95, 10),
+                  subset='validation', verbose=False,
+                  check_status=True):
+         if not ground_truth_filename:
+             raise IOError('Please input a valid ground truth file.')
+         if not prediction_filename:
+             raise IOError('Please input a valid prediction file.')
+         self.subset = subset
+         self.tiou_thresholds = tiou_thresholds
+         self.verbose = verbose
+         self.gt_fields = ground_truth_fields
+         self.pred_fields = prediction_fields
+         self.ap = None
+         self.tdiff = None
+         self.check_status = check_status
+         self.num_class = opt["num_of_class"]
+         # Retrieve blocked videos from server.
+         if self.check_status:
+             self.blocked_videos = get_blocked_videos()
+         else:
+             self.blocked_videos = list()
+         # Import ground truth and predictions.
+         self.ground_truth, self.activity_index, cidx = self._import_ground_truth(
+             ground_truth_filename)
+         self.prediction = self._import_prediction(prediction_filename, cidx)
+
+         if self.verbose:
+             print('[INIT] Loaded annotations from {} subset.'.format(subset))
+             nr_gt = len(self.ground_truth)
+             print('\tNumber of ground truth instances: {}'.format(nr_gt))
+             nr_pred = len(self.prediction)
+             print('\tNumber of predictions: {}'.format(nr_pred))
+             print('\tFixed threshold for tiou score: {}'.format(self.tiou_thresholds))
+
+     def _import_ground_truth(self, ground_truth_filename):
+         """Reads ground truth file, checks that it is well formatted, and returns
+         the ground truth instances and the activity classes.
+
+         Parameters
+         ----------
+         ground_truth_filename : str
+             Full path to the ground truth json file.
+
+         Outputs
+         -------
+         ground_truth : df
+             Data frame containing the ground truth instances.
+         activity_index : dict
+             Dictionary mapping class labels to indices.
+         """
+         with open(ground_truth_filename, 'r') as fobj:
+             data = json.load(fobj)
+         # Checking format
+         if not all([field in list(data.keys()) for field in self.gt_fields]):
+             raise IOError('Please input a valid ground truth file.')
+
+         # Read ground truth data.
+         activity_index, cidx = {}, 0
+
+         video_lst, t_start_lst, t_end_lst, label_lst = [], [], [], []
+         for videoid, v in data['database'].items():
+             if self.subset not in v['subset']:
+                 continue
+
+             for ann in v['annotations']:
+                 if ann['label'] not in activity_index:
+                     activity_index[ann['label']] = cidx
+                     cidx += 1
+                 video_lst.append(videoid)
+                 t_start_lst.append(ann['segment'][0])
+                 t_end_lst.append(ann['segment'][1])
+                 label_lst.append(activity_index[ann['label']])
+
+         ground_truth = pd.DataFrame({'video-id': video_lst,
+                                      't-start': t_start_lst,
+                                      't-end': t_end_lst,
+                                      'label': label_lst})
+
+         return ground_truth, activity_index, cidx
+
+     def _import_prediction(self, prediction_filename, cidx):
+         """Reads prediction file, checks that it is well formatted, and returns
+         the prediction instances.
+
+         Parameters
+         ----------
+         prediction_filename : str
+             Full path to the prediction json file.
+
+         Outputs
+         -------
+         prediction : df
+             Data frame containing the prediction instances.
+         """
+         with open(prediction_filename, 'r') as fobj:
+             data = json.load(fobj)
+         # Checking format...
+         if not all([field in list(data.keys()) for field in self.pred_fields]):
+             raise IOError('Please input a valid prediction file.')
+
+         # Read predictions.
+         video_lst, t_start_lst, t_end_lst = [], [], []
+         label_lst, score_lst = [], []
+         gentime_lst = []
+         for videoid, v in data['results'].items():
+             if videoid in self.blocked_videos:
+                 continue
+             for result in v:
+                 if result['label'] not in self.activity_index.keys():
+                     continue
+
+                 label = self.activity_index[result['label']]
+                 video_lst.append(videoid)
+                 t_start_lst.append(result['segment'][0])
+                 t_end_lst.append(result['segment'][1])
+                 label_lst.append(label)
+                 score_lst.append(result['score'])
+                 gentime_lst.append(result['gentime'])
+
+         prediction = pd.DataFrame({'video-id': video_lst,
+                                    't-start': t_start_lst,
+                                    't-end': t_end_lst,
+                                    'label': label_lst,
+                                    'score': score_lst,
+                                    'gentime': gentime_lst})
+         return prediction
+
+     def wrapper_compute_average_precision(self):
+         """Computes average precision for each class in the subset.
+         """
+         ap = np.zeros((len(self.tiou_thresholds), len(self.activity_index)))
+         tdiff = np.zeros((len(self.tiou_thresholds), len(self.activity_index)))
+         cnt_tp = np.zeros((len(self.tiou_thresholds), len(self.activity_index)))
+         for activity, cidx in self.activity_index.items():
+             gt_idx = self.ground_truth['label'] == cidx
+             pred_idx = self.prediction['label'] == cidx
+             ap[:, cidx], tdiff[:, cidx], cnt_tp[:, cidx] = compute_average_precision_detection(
+                 self.ground_truth.loc[gt_idx].reset_index(drop=True),
+                 self.prediction.loc[pred_idx].reset_index(drop=True),
+                 tiou_thresholds=self.tiou_thresholds)
+
+         sum_tdiff = np.sum(tdiff, axis=1)
+         total_tp = np.sum(cnt_tp, axis=1)
+         # Guard against division by zero when a threshold has no true positives.
+         final_tdiff = np.zeros_like(sum_tdiff)
+         valid = total_tp > 0
+         final_tdiff[valid] = sum_tdiff[valid] / total_tp[valid]
+
+         return ap, final_tdiff
+
+     def evaluate(self):
+         """Evaluates a prediction file. For the detection task we measure the
+         interpolated mean average precision to measure the performance of a
+         method.
+         """
+         self.ap, self.tdiff = self.wrapper_compute_average_precision()
+         self.mAP = self.ap.mean(axis=1)
+         if self.verbose:
+             print('[RESULTS] Performance on ActivityNet detection task.')
+             print('\tAverage-mAP: {}'.format(self.mAP.mean()))
+             print('\tAverage-time diff: {}'.format(self.tdiff.mean()))
+
+ def compute_average_precision_detection(ground_truth, prediction, tiou_thresholds=np.linspace(0.5, 0.95, 10)):
+     """Compute average precision (detection task) between ground truth and
+     prediction data frames. If multiple predictions occur for the same
+     predicted segment, only the one with the highest score is matched as a
+     true positive. This code is greatly inspired by the Pascal VOC devkit.
+
+     Parameters
+     ----------
+     ground_truth : df
+         Data frame containing the ground truth instances.
+         Required fields: ['video-id', 't-start', 't-end']
+     prediction : df
+         Data frame containing the prediction instances.
+         Required fields: ['video-id', 't-start', 't-end', 'score']
+     tiou_thresholds : 1darray, optional
+         Temporal intersection over union thresholds.
+
+     Outputs
+     -------
+     ap : 1darray
+         Average precision score at each tiou threshold.
+     """
+     npos = float(len(ground_truth))
+     lock_gt = np.ones((len(tiou_thresholds), len(ground_truth))) * -1
+
+     # Sort predictions by decreasing score order.
+     sort_idx = prediction['score'].values.argsort()[::-1]
+     prediction = prediction.loc[sort_idx].reset_index(drop=True)
+
+     # Initialize true positive and false positive vectors.
+     tp = np.zeros((len(tiou_thresholds), len(prediction)))
+     fp = np.zeros((len(tiou_thresholds), len(prediction)))
+     timediff = np.zeros((len(tiou_thresholds), len(prediction)))
+
+     # Adaptation to query faster.
+     ground_truth_gbvn = ground_truth.groupby('video-id')
+
+     # Assigning true positives to ground truth instances.
+     for idx, this_pred in prediction.iterrows():
+
+         try:
+             # Check if there is at least one ground truth in the associated video.
+             ground_truth_videoid = ground_truth_gbvn.get_group(this_pred['video-id'])
+         except Exception:
+             fp[:, idx] = 1
+             continue
+
+         this_gt = ground_truth_videoid.reset_index()
+         tiou_arr = segment_iou(this_pred[['t-start', 't-end']].values,
+                                this_gt[['t-start', 't-end']].values)
+         gentime_pred = this_pred['gentime']
+         gentime_gt_arr = this_gt['t-end'].values
+         tiou_sorted_idx = tiou_arr.argsort()[::-1]
+         for tidx, tiou_thr in enumerate(tiou_thresholds):
+             for jdx in tiou_sorted_idx:
+                 if tiou_arr[jdx] < tiou_thr:
+                     fp[tidx, idx] = 1
+                     break
+                 if lock_gt[tidx, this_gt.loc[jdx]['index']] >= 0:
+                     continue
+                 # Assign as true positive after the filters above.
+                 tp[tidx, idx] = 1
+                 # Delay from the ground-truth action end to the time the
+                 # prediction was generated; skip non-finite values.
+                 time_diff = gentime_pred - gentime_gt_arr[jdx]
+                 timediff[tidx, idx] = time_diff if np.isfinite(time_diff) else 0.0
+                 lock_gt[tidx, this_gt.loc[jdx]['index']] = idx
+                 break
+
+             if fp[tidx, idx] == 0 and tp[tidx, idx] == 0:
+                 fp[tidx, idx] = 1
+
+     ap = np.zeros(len(tiou_thresholds))
+     tdiff = np.zeros(len(tiou_thresholds))
+     cnt_tp = np.zeros(len(tiou_thresholds))
+
+     for tidx in range(len(tiou_thresholds)):
+         # Computing prec-rec
+         this_tp = np.cumsum(tp[tidx, :]).astype(float)
+         this_fp = np.cumsum(fp[tidx, :]).astype(float)
+
+         # Guard the degenerate cases (no ground truth or no predictions).
+         if npos == 0 or len(this_tp) == 0:
+             continue
+
+         rec = this_tp / npos
+         prec = this_tp / (this_tp + this_fp)
+         ap[tidx] = interpolated_prec_rec(prec, rec)
+         this_tdiff = np.cumsum(timediff[tidx, :]).astype(float)
+         tdiff[tidx] = this_tdiff[-1]
+         cnt_tp[tidx] = this_tp[-1]
+
+     return ap, tdiff, cnt_tp
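
For orientation, a minimal sketch of driving this evaluator directly; the option values and file names below are illustrative assumptions, not part of the commit:

    import numpy as np
    from eval_detection_gentime import ANETdetection

    opt = {"num_of_class": 23}  # assumption: matches num_classes in loss_func.py
    # result.json (hypothetical) must carry 'results', 'version', 'external_data',
    # and a per-segment 'gentime' field alongside 'label', 'segment', 'score'.
    evaluator = ANETdetection(opt,
                              ground_truth_filename='data/thumos14_v2.json',
                              prediction_filename='result.json',
                              tiou_thresholds=np.linspace(0.1, 0.5, 5),
                              subset='test', verbose=True, check_status=False)
    evaluator.evaluate()
    print('Average-mAP:', evaluator.mAP.mean(), 'AEDT:', evaluator.tdiff.mean())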
Evaluation/utils.py ADDED
@@ -0,0 +1,76 @@
+ import json
+ #import urllib.request, urllib.error, urllib.parse
+
+ import numpy as np
+
+ API = 'http://ec2-52-11-11-89.us-west-2.compute.amazonaws.com/challenge17/api.py'
+
+ def get_blocked_videos(api=API):
+     # api_url = '{}?action=get_blocked'.format(api)
+     # req = urllib.request.Request(api_url)
+     # response = urllib.request.urlopen(req)
+     # return json.loads(response.read())
+     return list()
+
+ def interpolated_prec_rec(prec, rec):
+     """Interpolated AP - VOCdevkit from VOC 2011.
+     """
+     mprec = np.hstack([[0], prec, [0]])
+     mrec = np.hstack([[0], rec, [1]])
+     for i in range(len(mprec) - 1)[::-1]:
+         mprec[i] = max(mprec[i], mprec[i + 1])
+     idx = np.where(mrec[1::] != mrec[0:-1])[0] + 1
+     ap = np.sum((mrec[idx] - mrec[idx - 1]) * mprec[idx])
+     return ap
+
+ def segment_iou(target_segment, candidate_segments):
+     """Compute the temporal intersection over union between a
+     target segment and all the test segments.
+
+     Parameters
+     ----------
+     target_segment : 1d array
+         Temporal target segment containing [starting, ending] times.
+     candidate_segments : 2d array
+         Temporal candidate segments containing N x [starting, ending] times.
+
+     Outputs
+     -------
+     tiou : 1d array
+         Temporal intersection over union score of the N candidate segments.
+     """
+     tt1 = np.maximum(target_segment[0], candidate_segments[:, 0])
+     tt2 = np.minimum(target_segment[1], candidate_segments[:, 1])
+     # Intersection including non-negative overlap score.
+     segments_intersection = (tt2 - tt1).clip(0)
+     # Segment union.
+     segments_union = (candidate_segments[:, 1] - candidate_segments[:, 0]) \
+         + (target_segment[1] - target_segment[0]) - segments_intersection
+     # Compute overlap as the ratio of the intersection
+     # over union of two segments.
+     tIoU = segments_intersection.astype(float) / segments_union
+     return tIoU
+
+ def wrapper_segment_iou(target_segments, candidate_segments):
+     """Compute intersection over union between segments.
+
+     Parameters
+     ----------
+     target_segments : ndarray
+         2-dim array in format [m x 2:=[init, end]]
+     candidate_segments : ndarray
+         2-dim array in format [n x 2:=[init, end]]
+
+     Outputs
+     -------
+     tiou : ndarray
+         2-dim array [n x m] with IoU ratio.
+     Note: It assumes that candidate segments are scarcer than target segments.
+     """
+     if candidate_segments.ndim != 2 or target_segments.ndim != 2:
+         raise ValueError('Dimension of arguments is incorrect')
+
+     n, m = candidate_segments.shape[0], target_segments.shape[0]
+     tiou = np.empty((n, m))
+     for i in range(m):
+         tiou[:, i] = segment_iou(target_segments[i, :], candidate_segments)
+
+     return tiou
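
A quick sanity check of segment_iou on toy values (the numbers are made up for illustration):

    import numpy as np
    from utils import segment_iou

    target = np.array([2.0, 6.0])                    # one [start, end] segment
    candidates = np.array([[2.0, 6.0], [4.0, 8.0]])  # N x [start, end]
    print(segment_iou(target, candidates))           # -> [1.0, 0.3333...]  (2s overlap / 6s union)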
checkpoint/README.md ADDED
@@ -0,0 +1 @@
+ Please put the model files in this folder.
data/Poppins Black Italic 900.ttf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d56d2b8ff884cfae1b637e73a71f3caf1d16cdb5b4acc123d9cd0b5864ca2567
+ size 156916
data/Poppins ExtraBold Italic 800.ttf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:db8f803d5aaf8e646fd868d0a897ed9997985b88c931bfae3e08c7c8dc2556be
+ size 158896
data/egtea_annotations_split1.json ADDED
The diff for this file is too large to render. See raw diff
 
data/egtea_annotations_split2.json ADDED
The diff for this file is too large to render. See raw diff
 
data/egtea_annotations_split3.json ADDED
The diff for this file is too large to render. See raw diff
 
data/egtea_annotations_split4.json ADDED
The diff for this file is too large to render. See raw diff
 
data/test_video_annotations.json ADDED
The diff for this file is too large to render. See raw diff
 
data/thumos14_v2.json ADDED
The diff for this file is too large to render. See raw diff
 
data/thumos14_v2_small.json ADDED
The diff for this file is too large to render. See raw diff
 
dataset.py ADDED
@@ -0,0 +1,533 @@
+ import numpy as np
+ import h5py
+ import json
+ import torch
+ import torch.utils.data as data
+ import os
+ import pickle
+ from multiprocessing import Pool
+
+ def load_json(file):
+     with open(file) as json_file:
+         data = json.load(json_file)
+     return data
+
+ def calc_iou(a, b):
+     st = a[0] - a[1]
+     ed = a[0]
+     target_st = b[0] - b[1]
+     target_ed = b[0]
+     sst = min(st, target_st)
+     led = max(ed, target_ed)
+     lst = max(st, target_st)
+     sed = min(ed, target_ed)
+     iou = (sed - lst) / max(led - sst, 1)
+     return iou
+
+ def box_include(y, target):
+     st = y[0] - y[1]
+     ed = y[0]
+     target_st = target[0] - target[1]
+     target_ed = target[0]
+     detection_point = target_st
+     if ed > detection_point and target_st < st and target_ed > ed:
+         return True
+     return False
+
+ class VideoDataSet(data.Dataset):
+     def __init__(self, opt, subset="train", video_name=None):
+         self.subset = subset
+         self.mode = opt["mode"]
+         self.predefined_fps = opt["predefined_fps"]
+         self.video_anno_path = opt["video_anno"].format(opt["split"])
+         self.video_len_path = opt["video_len_file"].format(self.subset + '_' + opt["setup"])
+         self.num_of_class = opt["num_of_class"]
+         self.segment_size = opt["segment_size"]
+         self.label_name = []
+         self.match_score = {}
+         self.match_score_end = {}
+         self.match_length = {}
+         self.gt_action = {}
+         self.cls_label = {}
+         self.reg_label = {}
+         self.snip_label = {}
+         self.inputs = []
+         self.inputs_all = []
+         self.data_rescale = opt["data_rescale"]
+         self.anchors = opt["anchors"]
+         self.pos_threshold = opt["pos_threshold"]
+         self.single_video_name = video_name
+
+         self._getDatasetDict()
+         self._loadFeaturelen(opt)
+         self._getMatchScore()
+         self._makeInputSeq()
+         self._loadPropLabel(opt['proposal_label_file'].format(self.subset + '_' + opt["setup"]))
+
+         if self.subset == "train":
+             if opt['data_format'] == "h5":
+                 feature_rgb_file = h5py.File(opt["video_feature_rgb_train"], 'r')
+                 self.feature_rgb_file = {}
+                 keys = self.video_list
+                 for vidx in range(len(keys)):
+                     if keys[vidx] not in feature_rgb_file:
+                         raise ValueError(f"Features for video {keys[vidx]} not found in {opt['video_feature_rgb_train']}")
+                     self.feature_rgb_file[keys[vidx]] = np.array(feature_rgb_file[keys[vidx]][:])
+                 if opt['rgb_only']:
+                     self.feature_flow_file = None
+                 else:
+                     self.feature_flow_file = {}
+                     feature_flow_file = h5py.File(opt["video_feature_flow_train"], 'r')
+                     for vidx in range(len(keys)):
+                         if keys[vidx] not in feature_flow_file:
+                             raise ValueError(f"Features for video {keys[vidx]} not found in {opt['video_feature_flow_train']}")
+                         self.feature_flow_file[keys[vidx]] = np.array(feature_flow_file[keys[vidx]][:])
+             elif opt['data_format'] == "pickle":
+                 feature_All = pickle.load(open(opt["video_feature_all_train"], 'rb'))
+                 self.feature_rgb_file = {}
+                 self.feature_flow_file = {}
+                 keys = self.video_list
+                 for vidx in range(len(keys)):
+                     if keys[vidx] not in feature_All:
+                         raise ValueError(f"Features for video {keys[vidx]} not found in {opt['video_feature_all_train']}")
+                     self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]]['rgb']
+                     self.feature_flow_file[keys[vidx]] = feature_All[keys[vidx]]['flow']
+             elif opt['data_format'] == "npz":
+                 feature_All = {}
+                 self.feature_rgb_file = {}
+                 self.feature_flow_file = {}
+                 for file in self.video_list:
+                     feature_path = opt["video_feature_all_train"] + file + '.npz'
+                     if not os.path.exists(feature_path):
+                         raise ValueError(f"Feature file {feature_path} not found")
+                     feature_All[file] = np.load(feature_path)['feats']
+                 keys = self.video_list
+                 for vidx in range(len(keys)):
+                     self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]][:]
+                 self.feature_flow_file = None
+             elif opt['data_format'] == "npz_i3d":
+                 feature_All = {}
+                 self.feature_rgb_file = {}
+                 self.feature_flow_file = {}
+                 for file in self.video_list:
+                     feature_path = opt["video_feature_all_train"] + file + '.npz'
+                     if not os.path.exists(feature_path):
+                         raise ValueError(f"Feature file {feature_path} not found")
+                     feature_All[file] = np.load(feature_path)
+                 keys = self.video_list
+                 for vidx in range(len(keys)):
+                     self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]]['rgb']
+                     self.feature_flow_file[keys[vidx]] = feature_All[keys[vidx]]['flow']
+             elif opt['data_format'] == "pt":
+                 feature_All = {}
+                 self.feature_rgb_file = {}
+                 self.feature_flow_file = {}
+                 for file in self.video_list:
+                     feature_path = opt["video_feature_all_train"] + file + '.pt'
+                     if not os.path.exists(feature_path):
+                         raise ValueError(f"Feature file {feature_path} not found")
+                     feature_All[file] = torch.load(feature_path)
+                 keys = self.video_list
+                 for vidx in range(len(keys)):
+                     self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]][:]
+                 self.feature_flow_file = None
+         else:
+             if opt['data_format'] == "h5":
+                 feature_rgb_file = h5py.File(opt["video_feature_rgb_test"], 'r')
+                 self.feature_rgb_file = {}
+                 keys = self.video_list
+                 for vidx in range(len(keys)):
+                     if keys[vidx] not in feature_rgb_file:
+                         raise ValueError(f"Features for video {keys[vidx]} not found in {opt['video_feature_rgb_test']}")
+                     self.feature_rgb_file[keys[vidx]] = np.array(feature_rgb_file[keys[vidx]][:])
+                 if opt['rgb_only']:
+                     self.feature_flow_file = None
+                 else:
+                     self.feature_flow_file = {}
+                     feature_flow_file = h5py.File(opt["video_feature_flow_test"], 'r')
+                     for vidx in range(len(keys)):
+                         if keys[vidx] not in feature_flow_file:
+                             raise ValueError(f"Features for video {keys[vidx]} not found in {opt['video_feature_flow_test']}")
+                         self.feature_flow_file[keys[vidx]] = np.array(feature_flow_file[keys[vidx]][:])
+             elif opt['data_format'] == "pickle":
+                 feature_All = pickle.load(open(opt["video_feature_all_test"], 'rb'))
+                 self.feature_rgb_file = {}
+                 self.feature_flow_file = {}
+                 keys = self.video_list
+                 for vidx in range(len(keys)):
+                     if keys[vidx] not in feature_All:
+                         raise ValueError(f"Features for video {keys[vidx]} not found in {opt['video_feature_all_test']}")
+                     self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]]['rgb']
+                     self.feature_flow_file[keys[vidx]] = feature_All[keys[vidx]]['flow']
+             elif opt['data_format'] == "npz":
+                 feature_All = {}
+                 self.feature_rgb_file = {}
+                 self.feature_flow_file = {}
+                 for file in self.video_list:
+                     feature_path = opt["video_feature_all_test"] + file + '.npz'
+                     if not os.path.exists(feature_path):
+                         raise ValueError(f"Feature file {feature_path} not found")
+                     feature_All[file] = np.load(feature_path)['feats']
+                 keys = self.video_list
+                 for vidx in range(len(keys)):
+                     self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]][:]
+                 self.feature_flow_file = None
+             elif opt['data_format'] == "npz_i3d":
+                 feature_All = {}
+                 self.feature_rgb_file = {}
+                 self.feature_flow_file = {}
+                 for file in self.video_list:
+                     feature_path = opt["video_feature_all_test"] + file + '.npz'
+                     if not os.path.exists(feature_path):
+                         raise ValueError(f"Feature file {feature_path} not found")
+                     feature_All[file] = np.load(feature_path)
+                 keys = self.video_list
+                 for vidx in range(len(keys)):
+                     self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]]['rgb']
+                     self.feature_flow_file[keys[vidx]] = feature_All[keys[vidx]]['flow']
+             elif opt['data_format'] == "pt":
+                 feature_All = {}
+                 self.feature_rgb_file = {}
+                 self.feature_flow_file = {}
+                 for file in self.video_list:
+                     feature_path = opt["video_feature_all_test"] + file + '.pt'
+                     if not os.path.exists(feature_path):
+                         raise ValueError(f"Feature file {feature_path} not found")
+                     feature_All[file] = torch.load(feature_path)
+                 keys = self.video_list
+                 for vidx in range(len(keys)):
+                     self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]][:]
+                 self.feature_flow_file = None
+
+     def _loadFeaturelen(self, opt):
+         if os.path.exists(self.video_len_path):
+             self.video_len = load_json(self.video_len_path)
+             return
+
+         self.video_len = {}
+         if self.subset == "train":
+             if opt['data_format'] == "h5":
+                 feature_file = h5py.File(opt["video_feature_rgb_train"], 'r')
+             elif opt['data_format'] == "pickle":
+                 feature_file = pickle.load(open(opt["video_feature_all_train"], 'rb'))
+             elif opt['data_format'] == "npz":
+                 feature_file = {}
+                 for file in self.video_list:
+                     feature_file[file] = np.load(opt["video_feature_all_train"] + file + '.npz')['feats']
+             elif opt['data_format'] == "npz_i3d":
+                 feature_file = {}
+                 for file in self.video_list:
+                     feature_file[file] = np.load(opt["video_feature_all_train"] + file + '.npz')
+             elif opt['data_format'] == "pt":
+                 feature_file = {}
+                 for file in self.video_list:
+                     feature_file[file] = torch.load(opt["video_feature_all_train"] + file + '.pt')
+         else:
+             if opt['data_format'] == "h5":
+                 feature_file = h5py.File(opt["video_feature_rgb_test"], 'r')
+             elif opt['data_format'] == "pickle":
+                 feature_file = pickle.load(open(opt["video_feature_all_test"], 'rb'))
+             elif opt['data_format'] == "npz":
+                 feature_file = {}
+                 for file in self.video_list:
+                     feature_file[file] = np.load(opt["video_feature_all_test"] + file + '.npz')['feats']
+             elif opt['data_format'] == "npz_i3d":
+                 feature_file = {}
+                 for file in self.video_list:
+                     feature_file[file] = np.load(opt["video_feature_all_test"] + file + '.npz')
+             elif opt['data_format'] == "pt":
+                 feature_file = {}
+                 for file in self.video_list:
+                     feature_file[file] = torch.load(opt["video_feature_all_test"] + file + '.pt')
+
+         keys = self.video_list
+         if opt['data_format'] == "h5":
+             for vidx in range(len(keys)):
+                 self.video_len[keys[vidx]] = len(feature_file[keys[vidx]])
+         elif opt['data_format'] == "pickle":
+             for vidx in range(len(keys)):
+                 self.video_len[keys[vidx]] = len(feature_file[keys[vidx]]['rgb'])
+         elif opt['data_format'] == "npz":
+             for vidx in range(len(keys)):
+                 self.video_len[keys[vidx]] = len(feature_file[keys[vidx]])
+         elif opt['data_format'] == "npz_i3d":
+             for vidx in range(len(keys)):
+                 self.video_len[keys[vidx]] = len(feature_file[keys[vidx]]['rgb'])
+         elif opt['data_format'] == "pt":
+             for vidx in range(len(keys)):
+                 self.video_len[keys[vidx]] = len(feature_file[keys[vidx]])
+         outfile = open(self.video_len_path, "w")
+         json.dump(self.video_len, outfile, indent=2)
+         outfile.close()
+
+     def _getDatasetDict(self):
+         anno_database = load_json(self.video_anno_path)
+         anno_database = anno_database['database']
+         self.video_dict = {}
+         if self.single_video_name:
+             if self.single_video_name in anno_database:
+                 video_info = anno_database[self.single_video_name]
+                 video_subset = video_info['subset']
+                 if self.subset == "full" or self.subset in video_subset:
+                     self.video_dict[self.single_video_name] = video_info
+                     for seg in video_info['annotations']:
+                         if not seg['label'] in self.label_name:
+                             self.label_name.append(seg['label'])
+             else:
+                 raise ValueError(f"Video {self.single_video_name} not found in annotation database")
+         else:
+             for video_name in anno_database:
+                 video_info = anno_database[video_name]
+                 video_subset = anno_database[video_name]['subset']
+                 if self.subset == "full" or self.subset in video_subset:
+                     self.video_dict[video_name] = video_info
+                     for seg in video_info['annotations']:
+                         if not seg['label'] in self.label_name:
+                             self.label_name.append(seg['label'])
+
+         # Ensure all 22 EGTEA action classes are included
+         expected_labels = [
+             'Clean/Wipe', 'Close', 'Compress', 'Crack', 'Cut', 'Divide/Pull Apart',
+             'Dry', 'Inspect/Read', 'Mix', 'Move Around', 'Open', 'Operate', 'Other',
+             'Pour', 'Put', 'Squeeze', 'Take', 'Transfer', 'Turn off', 'Turn on', 'Wash',
+             'Spread'  # Assumed missing label; replace with actual label if known
+         ]
+         for label in expected_labels:
+             if label not in self.label_name:
+                 self.label_name.append(label)
+
+         self.label_name.sort()
+         self.video_list = list(self.video_dict.keys())
+         print(f"Labels in dataset.label_name: {self.label_name}")
+         print(f"Number of labels: {len(self.label_name)}, Expected: {self.num_of_class-1}")
+         print(f"{self.subset} subset video numbers: {len(self.video_list)}")
+
+     def _getMatchScore(self):
+         self.action_end_count = torch.zeros(2)
+         for index in range(0, len(self.video_list)):
+             video_name = self.video_list[index]
+             video_info = self.video_dict[video_name]
+             video_labels = video_info['annotations']
+             gt_bbox = []
+             gt_edlen = []
+
+             second_to_frame = self.video_len[video_name] / float(video_info['duration'])
+             for j in range(len(video_labels)):
+                 tmp_info = video_labels[j]
+                 tmp_start = tmp_info['segment'][0] * second_to_frame
+                 tmp_end = tmp_info['segment'][1] * second_to_frame
+                 tmp_label = self.label_name.index(tmp_info['label'])
+                 gt_bbox.append([tmp_start, tmp_end, tmp_label])
+                 gt_edlen.append([gt_bbox[-1][1], gt_bbox[-1][1] - gt_bbox[-1][0], tmp_label])
+
+             gt_bbox = np.array(gt_bbox)
+             gt_edlen = np.array(gt_edlen)
+             self.gt_action[video_name] = gt_edlen
+
+             match_score = np.zeros((self.video_len[video_name], self.num_of_class - 1), dtype=np.float32)
+             for idx in range(gt_bbox.shape[0]):
+                 ed = int(gt_bbox[idx, 1]) + 1
+                 st = int(gt_bbox[idx, 0])
+                 match_score[st:ed, int(gt_bbox[idx, 2])] = idx + 1
+             self.match_score[video_name] = match_score
+
+     def _makeInputSeq(self):
+         data_idx = 0
+         for index in range(0, len(self.video_list)):
+             video_name = self.video_list[index]
+             duration = self.match_score[video_name].shape[0]
+             for i in range(1, duration + 1):
+                 st = i - self.segment_size
+                 ed = i
+                 self.inputs_all.append([video_name, st, ed, data_idx])
+                 data_idx += 1
+
+         self.inputs = self.inputs_all.copy()
+         print(f"{self.subset} subset seg numbers: {len(self.inputs)}")
+
+     def _makePropLabelUnit(self, i):
+         video_name = self.inputs_all[i][0]
+         st = self.inputs_all[i][1]
+         ed = self.inputs_all[i][2]
+         cls_anc = []
+         reg_anc = []
+
+         for j in range(0, len(self.anchors)):
+             v1 = np.zeros(self.num_of_class)
+             v1[-1] = 1
+             v2 = np.zeros(2)
+             v2[-1] = -1e3
+             y_box = [ed - 1, self.anchors[j]]
+
+             subset_label = self._get_train_label_with_class(video_name, ed - self.anchors[j], ed)
+             idx_list = []
+             for ii in range(0, subset_label.shape[0]):
+                 for jj in range(0, subset_label.shape[1]):
+                     idx = int(subset_label[ii, jj])
+                     if idx > 0 and idx - 1 not in idx_list:
+                         idx_list.append(idx - 1)
+
+             for idx in idx_list:
+                 target_box = self.gt_action[video_name][idx]
+                 cls = int(target_box[2])
+                 iou = calc_iou(y_box, target_box)
+                 if iou >= self.pos_threshold or (j == len(self.anchors) - 1 and box_include(y_box, target_box)) or (j == 0 and box_include(target_box, y_box)):
+                     v1[cls] = 1
+                     v1[-1] = 0
+                     v2[0] = 1.0 * (target_box[0] - y_box[0]) / self.anchors[j]
+                     v2[1] = np.log(1.0 * max(1, target_box[1]) / y_box[1])
+
+             cls_anc.append(v1)
+             reg_anc.append(v2)
+
+         v0 = np.zeros(self.num_of_class)
+         v0[-1] = 1
+         segment_size = ed - st
+         y_box = [ed - 1, self.anchors[-1]]
+         subset_label = self._get_train_label_with_class(video_name, ed - self.anchors[-1], ed)
+         idx_list = []
+         for ii in range(0, subset_label.shape[0]):
+             for jj in range(0, subset_label.shape[1]):
+                 idx = int(subset_label[ii, jj])
+                 if idx > 0 and idx - 1 not in idx_list:
+                     idx_list.append(idx - 1)
+
+         for idx in idx_list:
+             target_box = self.gt_action[video_name][idx]
+             cls = int(target_box[2])
+             iou = calc_iou(y_box, target_box)
+             if iou >= 0:
+                 v0[cls] = 1
+                 v0[-1] = 0
+
+         cls_anc = np.stack(cls_anc, axis=0)
+         reg_anc = np.stack(reg_anc, axis=0)
+         cls_snip = np.array(v0)
+         return cls_anc, reg_anc, cls_snip
+
+     def _loadPropLabel(self, filename):
+         if os.path.exists(filename):
+             prop_label_file = h5py.File(filename, 'r')
+             self.cls_label = np.array(prop_label_file['cls_label'][:])
+             self.reg_label = np.array(prop_label_file['reg_label'][:])
+             self.snip_label = np.array(prop_label_file['snip_label'][:])
+             prop_label_file.close()
+             self.action_frame_count = np.sum(self.cls_label.reshape((-1, self.cls_label.shape[-1])), axis=0)
+             self.action_frame_count = torch.Tensor(self.action_frame_count)
+             return
+
+         pool = Pool(os.cpu_count() // 2)
+         labels = pool.map(self._makePropLabelUnit, range(0, len(self.inputs_all)))
+         pool.close()
+         pool.join()
+
+         cls_label = []
+         reg_label = []
+         snip_label = []
+         for i in range(0, len(labels)):
+             cls_label.append(labels[i][0])
+             reg_label.append(labels[i][1])
+             snip_label.append(labels[i][2])
+         self.cls_label = np.stack(cls_label, axis=0)
+         self.reg_label = np.stack(reg_label, axis=0)
+         self.snip_label = np.stack(snip_label, axis=0)
+
+         outfile = h5py.File(filename, 'w')
+         dset_cls = outfile.create_dataset('/cls_label', self.cls_label.shape, maxshape=self.cls_label.shape, chunks=True, dtype=np.float32)
+         dset_cls[:, :] = self.cls_label[:, :]
+         dset_reg = outfile.create_dataset('/reg_label', self.reg_label.shape, maxshape=self.reg_label.shape, chunks=True, dtype=np.float32)
+         dset_reg[:, :] = self.reg_label[:, :]
+         dset_snip = outfile.create_dataset('/snip_label', self.snip_label.shape, maxshape=self.snip_label.shape, chunks=True, dtype=np.float32)
+         dset_snip[:, :] = self.snip_label[:, :]
+         outfile.close()
+
+         return
+
+     def __getitem__(self, index):
+         video_name, st, ed, data_idx = self.inputs[index]
+         if st >= 0:
+             feature = self._get_base_data(video_name, st, ed)
+         else:
+             feature = self._get_base_data(video_name, 0, ed)
+             padfunc2d = torch.nn.ConstantPad2d((0, 0, -st, 0), 0)
+             feature = padfunc2d(feature)
+
+         cls_label = torch.Tensor(self.cls_label[data_idx])
+         reg_label = torch.Tensor(self.reg_label[data_idx])
+         snip_label = torch.Tensor(self.snip_label[data_idx])
+
+         return feature, cls_label, reg_label, snip_label
+
+     def _get_base_data(self, video_name, st, ed):
+         feature_rgb = self.feature_rgb_file[video_name]
+         feature_rgb = feature_rgb[st:ed, :]
+
+         if self.feature_flow_file is not None:
+             feature_flow = self.feature_flow_file[video_name]
+             feature_flow = feature_flow[st:ed, :]
+             feature = np.append(feature_rgb, feature_flow, axis=1)
+         else:
+             feature = feature_rgb
+         feature = torch.from_numpy(np.array(feature))
+
+         return feature
+
+     def _get_train_label_with_class(self, video_name, st, ed):
+         duration = len(self.match_score[video_name])
+         st_padding = 0
+         ed_padding = 0
+         if st < 0:
+             st_padding = -st
+             st = 0
+         if ed > duration:
+             ed_padding = ed - duration
+             ed = duration
+
+         match_score = torch.Tensor(self.match_score[video_name][st:ed])
+         if st_padding > 0:
+             padfunc2d = torch.nn.ConstantPad2d((0, 0, st_padding, 0), 0)
+             match_score = padfunc2d(match_score)
+         if ed_padding > 0:
+             padfunc2d = torch.nn.ConstantPad2d((0, 0, 0, ed_padding), 0)
+             match_score = padfunc2d(match_score)
+         return match_score
+
+     def __len__(self):
+         return len(self.inputs)
+
+     def reset_sample(self):
+         self.inputs = self.inputs_all.copy()
+
+     def select_sample(self, idx):
+         inputs = [self.inputs_all[i] for i in idx]
+         self.inputs = inputs.copy()
+         return
+
+ class SuppressDataSet(data.Dataset):
+     def __init__(self, opt, subset="train"):
+         self.subset = subset
+         self.mode = opt["mode"]
+         self.data_file = h5py.File(opt["suppress_label_file"].format(self.subset + "_" + opt['setup']), 'r')
+         self.video_list = list(self.data_file.keys())
+         self.inputs = []
+         for index in range(0, len(self.video_list)):
+             video_name = self.video_list[index]
+             duration = self.data_file[video_name + '/input'].shape[0]
+             for i in range(0, duration):
+                 self.inputs.append([video_name, i])
+
+         print(f"{self.subset} subset seg numbers: {len(self.inputs)}")
+
+     def __getitem__(self, index):
+         video_name, idx = self.inputs[index]
+
+         input_seq = self.data_file[video_name + '/input'][idx]
+         label = self.data_file[video_name + '/label'][idx]
+
+         input_seq = torch.from_numpy(input_seq)
+         label = torch.from_numpy(label)
+
+         return input_seq, label
+
+     def __len__(self):
+         return len(self.inputs)
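
For reference, the annotation layout this dataset class reads, reconstructed from the accesses above (the video id and values are illustrative, not taken from the shipped JSON files):

    {
      "database": {
        "OP01-R01-PastaSalad": {
          "subset": "train",
          "duration": 1436.6,
          "annotations": [
            {"label": "Open", "segment": [10.2, 12.8]},
            {"label": "Take", "segment": [13.0, 14.1]}
          ]
        }
      }
    }

Segment times are in seconds; _getMatchScore converts them to feature-frame indices via second_to_frame.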
eval.py ADDED
@@ -0,0 +1,39 @@
+ # -*- coding: utf-8 -*-
+ import sys
+ sys.path.append('./Evaluation')
+ from eval_detection_gentime import ANETdetection
+ import matplotlib.pyplot as plt
+ import numpy as np
+
+ def run_evaluation_detection(opt, ground_truth_filename, prediction_filename,
+                              tiou_thresholds=np.linspace(0.5, 0.95, 10),
+                              subset='validation', verbose=True):
+
+     anet_detection = ANETdetection(opt, ground_truth_filename, prediction_filename,
+                                    subset=subset, tiou_thresholds=tiou_thresholds,
+                                    verbose=verbose, check_status=False)
+     anet_detection.evaluate()
+
+     ap = anet_detection.ap
+     mAP = anet_detection.mAP
+     tdiff = anet_detection.tdiff
+
+     return (mAP, ap, tdiff)
+
+ def evaluation_detection(opt, verbose=True):
+
+     mAP, AP, tdiff = run_evaluation_detection(
+         opt,
+         opt["video_anno"].format(opt["split"]),
+         opt["result_file"].format(opt['exp']),
+         tiou_thresholds=np.linspace(0.1, 0.50, 5),
+         subset=opt['inference_subset'], verbose=verbose)
+
+     if verbose:
+         print('mAP')
+         print(mAP)
+         print('AEDT')
+         print(tdiff)
+
+     return mAP
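
A minimal sketch of calling this entry point; only the option keys read above are shown, and the values (paths, split, experiment name, subset) are assumptions for illustration:

    from eval import evaluation_detection

    opt = {
        "num_of_class": 23,
        "video_anno": "data/egtea_annotations_split{}.json",  # formatted with opt["split"]
        "split": 1,
        "result_file": "output/result_proposal_{}.json",      # hypothetical; formatted with opt["exp"]
        "exp": "demo",
        "inference_subset": "test",
    }
    mAP = evaluation_detection(opt, verbose=True)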
feature_extractor.py ADDED
@@ -0,0 +1,29 @@
+ from models.i3d.extract_i3d import ExtractI3D
+ from utils.utils import build_cfg_path
+ from omegaconf import OmegaConf
+ import torch
+ from tqdm import tqdm
+ import os
+ import numpy as np
+
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+ if device == 'cuda':
+     print(torch.cuda.get_device_name(0))
+ # Select the feature type
+ feature_type = 'i3d'
+
+ # Load and patch the config
+ args = OmegaConf.load(build_cfg_path(feature_type))
+ args.step_size = 12
+ args.flow_type = 'raft'  # 'pwc'
+
+ # Load the model
+ extractor = ExtractI3D(args)
+
+ args.video_paths = os.listdir('./Videos')
+
+ # Extract features
+ for video_path in tqdm(args.video_paths):
+     print(f'Extracting for {video_path}')
+     feature_dict = extractor.extract('./Videos/' + video_path)
+     np.savez('./I3D/' + video_path[:-4] + '.npz', **feature_dict)
+     [(print(k), print(v.shape)) for k, v in feature_dict.items()]
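
The saved .npz files hold the extractor's feature dict; for the i3d feature type this is expected to include 'rgb' and 'flow' arrays, which lines up with the npz_i3d data_format branch that dataset.py uses to read features back (an assumption about the I3D extractor's output keys, not verified here).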
iou_utils.py ADDED
@@ -0,0 +1,65 @@
+ import numpy as np
+
+ def non_max_suppression(proposals, overlapThresh=0.3):
+     # if there are no intervals, return an empty list
+     if len(proposals) == 0:
+         return []
+
+     sorted_proposal = sorted(proposals, key=lambda proposal: proposal['score'], reverse=True)
+     idx = 0
+     total_proposal = len(sorted_proposal)
+     while idx < total_proposal:
+         proposal = sorted_proposal[idx]
+         st = proposal['segment'][0]
+         ed = proposal['segment'][1]
+         label = proposal['label']
+
+         delete_item = []
+         for j in range(idx + 1, total_proposal):
+             target_proposal = sorted_proposal[j]
+             target_st = target_proposal['segment'][0]
+             target_ed = target_proposal['segment'][1]
+             target_label = target_proposal['label']
+
+             if label == target_label:
+                 sst = np.minimum(st, target_st)
+                 led = np.maximum(ed, target_ed)
+                 lst = np.maximum(st, target_st)
+                 sed = np.minimum(ed, target_ed)
+
+                 iou = (sed - lst) / max(led - sst, 1)
+                 if iou > overlapThresh:
+                     delete_item.append(target_proposal)
+
+         for item in delete_item:
+             sorted_proposal.remove(item)
+         total_proposal = len(sorted_proposal)
+         idx += 1
+
+     return sorted_proposal
+
+ def check_overlap_proposal(proposal_list, new_proposal, overlapThresh=0.3):
+     for proposal in proposal_list:
+         st = proposal['segment'][0]
+         ed = proposal['segment'][1]
+         label = proposal['label']
+
+         new_st = new_proposal['segment'][0]
+         new_ed = new_proposal['segment'][1]
+         new_label = new_proposal['label']
+
+         if label == new_label:
+             sst = np.minimum(st, new_st)
+             led = np.maximum(ed, new_ed)
+             lst = np.maximum(st, new_st)
+             sed = np.minimum(ed, new_ed)
+
+             iou = (sed - lst) / max(led - sst, 1)
+             if iou > overlapThresh:
+                 return proposal
+
+     return None
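
A toy run of the class-wise NMS above (scores, labels, and segments invented for illustration):

    from iou_utils import non_max_suppression

    proposals = [
        {'segment': [0.0, 4.0], 'score': 0.9, 'label': 'Open'},
        {'segment': [0.5, 4.5], 'score': 0.8, 'label': 'Open'},   # IoU with first ~0.78 -> suppressed
        {'segment': [10.0, 12.0], 'score': 0.7, 'label': 'Take'},
    ]
    kept = non_max_suppression(proposals, overlapThresh=0.3)
    # kept retains the first and third proposals; suppression only applies within a label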
loss_func.py ADDED
@@ -0,0 +1,374 @@
+ import torch
+ import numpy as np
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import torch.distributed as dist
+ from functools import partial
+
+ class MultiCrossEntropyLoss(nn.Module):
+     def __init__(self, focal=False, weight=None, reduce=True):
+         super(MultiCrossEntropyLoss, self).__init__()
+         self.num_classes = 23
+         self.focal = focal
+         self.weight = weight
+         self.reduce = reduce
+         self.gamma_ = torch.zeros(self.num_classes).cuda() + 0.025
+         self.gamma_f = 0.05
+
+         self.register_buffer('pos_grad', torch.zeros(self.num_classes - 1).cuda())
+         self.register_buffer('neg_grad', torch.zeros(self.num_classes - 1).cuda())
+         self.register_buffer('pos_neg', torch.ones(self.num_classes - 1).cuda())
+
+     def forward(self, input, target):
+         # Normalize multi-label targets into a distribution per sample.
+         target_sum = torch.sum(target, dim=1)
+         target_div = torch.where(target_sum != 0, target_sum, torch.ones_like(target_sum)).unsqueeze(1)
+         target = target / target_div
+         logsoftmax = nn.LogSoftmax(dim=1).to(input.device)
+         # Per-class focusing factor, modulated by the accumulated pos/neg gradient ratio.
+         gamma = self.gamma_.clone()
+         gamma[:-1] = gamma[:-1] + self.gamma_f * (1 - self.pos_neg)
+
+         if not self.focal:
+             if self.weight is None:
+                 output = torch.sum(-target * logsoftmax(input), 1)
+             else:
+                 output = torch.sum(-target * logsoftmax(input) / self.weight, 1)
+         else:
+             softmax = nn.Softmax(dim=1).to(input.device)
+             p = softmax(input)
+             output = torch.sum(-target * (1 - p) ** gamma * logsoftmax(input), 1)
+
+         if self.reduce:
+             return torch.mean(output)
+         else:
+             return output
+
+     def map_func(self, x, s):
+         min_val = torch.min(x)
+         max_val = torch.max(x)
+         mu = torch.mean(x)
+         x = (x - min_val) / (max_val - min_val)
+         return 1 / (1 + torch.exp(-s * (x - mu)))
+
+     def collect_grad(self, target, grad):
+         grad = torch.abs(grad.reshape(-1, grad.shape[-1])).cuda()
+         target = target.reshape(-1, target.shape[-1]).cuda()
+         pos_grad = torch.sum(grad * target, dim=0)[:-1]
+         neg_grad = torch.sum(grad * (1 - target), dim=0)[:-1]
+         self.pos_grad += pos_grad
+         self.neg_grad += neg_grad
+         self.pos_neg = torch.clamp(self.pos_grad / (self.neg_grad + 1e-10), min=0, max=1)
+         self.pos_neg = self.map_func(self.pos_neg, 1)
+
+ def cls_loss_func(y, output, use_focal=False, weight=None, reduce=True):
+     input_size = y.size()
+     y = y.float().cuda()
+     if weight is not None:
+         weight = weight.cuda()
+     loss_func = MultiCrossEntropyLoss(focal=True, weight=weight, reduce=reduce)
+
+     y = y.reshape(-1, y.size(-1))
+     output = output.reshape(-1, output.size(-1))
+     loss = loss_func(output, y)
+
+     if not reduce:
+         loss = loss.reshape(input_size[:-1])
+
+     return loss
+
+ def cls_loss_func_(loss_func, y, output, use_focal=False, weight=None, reduce=True):
+     input_size = y.size()
+     y = y.float().cuda()
+     if weight is not None:
+         weight = weight.cuda()
+
+     y = y.reshape(-1, y.size(-1))
+     output = output.reshape(-1, output.size(-1))
+     loss = loss_func(output, y)
+
+     if not reduce:
+         loss = loss.reshape(input_size[:-1])
+
+     return loss
+
+ def regress_loss_func(y, output):
+     y = y.float().cuda()
+     y = y.reshape(-1, y.size(-1))
+     output = output.reshape(-1, output.size(-1))
+
+     # Background anchors carry a large negative length target (v2[-1] = -1e3 in dataset.py).
+     bgmask = y[:, 1] < -1e2
+
+     fg_logits = output[~bgmask]
+     bg_logits = output[bgmask]
+
+     fg_target = y[~bgmask]
+     bg_target = y[bgmask]
+
+     loss = nn.functional.l1_loss(fg_logits, fg_target)
+
+     # With no foreground anchors in the batch, l1_loss returns NaN.
+     if loss.isnan():
+         return torch.tensor([0.0], requires_grad=True).cuda()
+     return loss
+
+ def suppress_loss_func(y, output):
+     y = y.float().cuda()
+     y = y.reshape(-1, y.size(-1))
+     output = output.reshape(-1, output.size(-1))
+
+     loss = nn.functional.binary_cross_entropy(output, y)
+
+     return loss
126
+
127
+
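As a quick sanity check, here is a minimal sketch of calling these helpers on dummy tensors. Shapes follow the anchor heads in models.py (batch x anchors x classes for classification, batch x anchors x 2 for regression); a CUDA device is required because the helpers and MultiCrossEntropyLoss move tensors to the GPU, and all numbers are invented.

import torch
from loss_func import cls_loss_func, regress_loss_func

cls_label = torch.zeros(4, 6, 23)
cls_label[:, :, -1] = 1                        # mark every anchor as background
act_cls = torch.randn(4, 6, 23).cuda()         # raw classification logits

reg_label = torch.full((4, 6, 2), -1e3)        # the -1e3 sentinel marks background anchors
reg_label[0, 0] = torch.tensor([0.2, 0.1])     # one foreground anchor
act_reg = torch.randn(4, 6, 2).cuda()

print(cls_loss_func(cls_label, act_cls))       # scalar focal cross-entropy
print(regress_loss_func(reg_label, act_reg))   # L1 over the single foreground anchor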
models.py ADDED
@@ -0,0 +1,232 @@
+import numpy as np
+import torch
+import math
+from torch.autograd import Variable
+import torch.nn.functional as F
+import torch.nn as nn
+from torch.nn import init
+from torch.nn.functional import normalize
+
+
+class PositionalEncoding(nn.Module):
+    def __init__(self,
+                 emb_size: int,
+                 dropout: float = 0.1,
+                 maxlen: int = 750):
+        super(PositionalEncoding, self).__init__()
+        den = torch.exp(-torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
+        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
+        pos_embedding = torch.zeros((maxlen, emb_size))
+        pos_embedding[:, 0::2] = torch.sin(pos * den)
+        pos_embedding[:, 1::2] = torch.cos(pos * den)
+        pos_embedding = pos_embedding.unsqueeze(-2)
+        self.dropout = nn.Dropout(dropout)
+        self.register_buffer('pos_embedding', pos_embedding)
+
+    def forward(self, token_embedding: torch.Tensor):
+        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])
+
+
+class HistoryUnit(torch.nn.Module):
+    def __init__(self, opt):
+        super(HistoryUnit, self).__init__()
+        self.n_feature = opt["feat_dim"]
+        n_class = opt["num_of_class"]
+        n_embedding_dim = opt["hidden_dim"]
+        n_hist_dec_head = 4
+        n_hist_dec_layer = 5
+        n_hist_dec_head_2 = 4
+        n_hist_dec_layer_2 = 2
+        self.anchors = opt["anchors"]
+        self.history_tokens = 16
+        self.short_window_size = 16
+        self.anchors_stride = []
+        dropout = 0.3
+        self.best_loss = 1000000
+        self.best_map = 0
+
+        self.history_positional_encoding = PositionalEncoding(n_embedding_dim, dropout, maxlen=400)
+
+        self.history_encoder_block1 = nn.TransformerDecoder(
+            nn.TransformerDecoderLayer(d_model=n_embedding_dim,
+                                       nhead=n_hist_dec_head,
+                                       dropout=dropout,
+                                       activation='gelu'),
+            n_hist_dec_layer,
+            nn.LayerNorm(n_embedding_dim))
+
+        self.history_encoder_block2 = nn.TransformerDecoder(
+            nn.TransformerDecoderLayer(d_model=n_embedding_dim,
+                                       nhead=n_hist_dec_head_2,
+                                       dropout=dropout,
+                                       activation='gelu'),
+            n_hist_dec_layer_2,
+            nn.LayerNorm(n_embedding_dim))
+
+        self.snip_head = nn.Sequential(nn.Linear(n_embedding_dim, n_embedding_dim // 4), nn.ReLU())
+        self.snip_classifier = nn.Sequential(
+            nn.Linear(self.history_tokens * n_embedding_dim // 4, (self.history_tokens * n_embedding_dim // 4) // 4),
+            nn.ReLU(),
+            nn.Linear((self.history_tokens * n_embedding_dim // 4) // 4, n_class))
+
+        # Learnable query tokens that summarize the long-term history.
+        self.history_token = nn.Parameter(torch.zeros(self.history_tokens, 1, n_embedding_dim))
+
+        self.norm2 = nn.LayerNorm(n_embedding_dim)
+        self.dropout2 = nn.Dropout(0.1)
+
+    def forward(self, long_x, encoded_x):
+        ## History Encoder
+        hist_pe_x = self.history_positional_encoding(long_x)
+        history_token = self.history_token.expand(-1, hist_pe_x.shape[1], -1)
+        hist_encoded_x_1 = self.history_encoder_block1(history_token, hist_pe_x)
+        hist_encoded_x_2 = self.history_encoder_block2(hist_encoded_x_1, encoded_x)
+        hist_encoded_x_2 = hist_encoded_x_2 + self.dropout2(hist_encoded_x_1)
+        hist_encoded_x = self.norm2(hist_encoded_x_2)
+
+        ## Snippet Classification Head
+        snippet_feat = self.snip_head(hist_encoded_x_1)
+        snippet_feat = torch.flatten(snippet_feat.permute(1, 0, 2), start_dim=1)
+        snip_cls = self.snip_classifier(snippet_feat)
+
+        return hist_encoded_x, snip_cls
+
+
+class MYNET(torch.nn.Module):
+    def __init__(self, opt):
+        super(MYNET, self).__init__()
+        self.n_feature = opt["feat_dim"]
+        n_class = opt["num_of_class"]
+        n_embedding_dim = opt["hidden_dim"]
+        n_enc_layer = opt["enc_layer"]
+        n_enc_head = opt["enc_head"]
+        n_dec_layer = opt["dec_layer"]
+        n_dec_head = opt["dec_head"]
+        n_comb_dec_head = 4
+        n_comb_dec_layer = 5
+        n_seglen = opt["segment_size"]
+        self.anchors = opt["anchors"]
+        self.history_tokens = 16
+        self.short_window_size = 16
+        self.anchors_stride = []
+        dropout = 0.3
+        self.best_loss = 1000000
+        self.best_map = 0
+
+        self.feature_reduction_rgb = nn.Linear(self.n_feature // 2, n_embedding_dim // 2)
+        self.feature_reduction_flow = nn.Linear(self.n_feature // 2, n_embedding_dim // 2)
+
+        self.positional_encoding = PositionalEncoding(n_embedding_dim, dropout, maxlen=400)
+
+        self.encoder = nn.TransformerEncoder(
+            nn.TransformerEncoderLayer(d_model=n_embedding_dim,
+                                       nhead=n_enc_head,
+                                       dropout=dropout,
+                                       activation='gelu'),
+            n_enc_layer,
+            nn.LayerNorm(n_embedding_dim))
+
+        self.decoder = nn.TransformerDecoder(
+            nn.TransformerDecoderLayer(d_model=n_embedding_dim,
+                                       nhead=n_dec_head,
+                                       dropout=dropout,
+                                       activation='gelu'),
+            n_dec_layer,
+            nn.LayerNorm(n_embedding_dim))
+
+        self.history_unit = HistoryUnit(opt)
+
+        self.history_anchor_decoder_block1 = nn.TransformerDecoder(
+            nn.TransformerDecoderLayer(d_model=n_embedding_dim,
+                                       nhead=n_comb_dec_head,
+                                       dropout=dropout,
+                                       activation='gelu'),
+            n_comb_dec_layer,
+            nn.LayerNorm(n_embedding_dim))
+
+        self.classifier = nn.Sequential(nn.Linear(n_embedding_dim, n_embedding_dim), nn.ReLU(), nn.Linear(n_embedding_dim, n_class))
+        self.regressor = nn.Sequential(nn.Linear(n_embedding_dim, n_embedding_dim), nn.ReLU(), nn.Linear(n_embedding_dim, 2))
+
+        # One learnable query token per anchor scale.
+        self.decoder_token = nn.Parameter(torch.zeros(len(self.anchors), 1, n_embedding_dim))
+
+        self.norm1 = nn.LayerNorm(n_embedding_dim)
+        self.dropout1 = nn.Dropout(0.1)
+
+        self.relu = nn.ReLU(True)
+        self.softmaxd1 = nn.Softmax(dim=-1)
+
+    def forward(self, inputs):
+        # inputs: batch x seq_len x feat_dim; the first half of the feature
+        # dimension is RGB, the second half is optical flow.
+        base_x_rgb = self.feature_reduction_rgb(inputs[:, :, :self.n_feature // 2].float())
+        base_x_flow = self.feature_reduction_flow(inputs[:, :, self.n_feature // 2:].float())
+        base_x = torch.cat([base_x_rgb, base_x_flow], dim=-1)
+
+        base_x = base_x.permute([1, 0, 2])  # seq_len x batch x featsize
+
+        short_x = base_x[-self.short_window_size:]   # recent frames
+        long_x = base_x[:-self.short_window_size]    # long-term history
+
+        ## Anchor Feature Generator
+        pe_x = self.positional_encoding(short_x)
+        encoded_x = self.encoder(pe_x)
+        decoder_token = self.decoder_token.expand(-1, encoded_x.shape[1], -1)
+        decoded_x = self.decoder(decoder_token, encoded_x)
+
+        ## Future-Supervised History Module
+        hist_encoded_x, snip_cls = self.history_unit(long_x, encoded_x)
+
+        ## History-Driven Anchor Refinement
+        decoded_anchor_feat = self.history_anchor_decoder_block1(decoded_x, hist_encoded_x)
+        decoded_anchor_feat = decoded_anchor_feat + self.dropout1(decoded_x)
+        decoded_anchor_feat = self.norm1(decoded_anchor_feat)
+        decoded_anchor_feat = decoded_anchor_feat.permute([1, 0, 2])
+
+        ## Prediction Module
+        anc_cls = self.classifier(decoded_anchor_feat)
+        anc_reg = self.regressor(decoded_anchor_feat)
+
+        return anc_cls, anc_reg, snip_cls
+
+
+class SuppressNet(torch.nn.Module):
+    def __init__(self, opt):
+        super(SuppressNet, self).__init__()
+        n_class = opt["num_of_class"] - 1
+        n_seglen = opt["segment_size"]
+        n_embedding_dim = 2 * n_seglen
+        dropout = 0.3
+        self.best_loss = 1000000
+        self.best_map = 0
+
+        # FC layers over the per-class confidence history.
+        self.mlp1 = nn.Linear(n_seglen, n_embedding_dim)
+        self.mlp2 = nn.Linear(n_embedding_dim, 1)
+        self.norm = nn.InstanceNorm1d(n_class)
+        self.relu = nn.ReLU(True)
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, inputs):
+        # inputs: batch x seq_len x class
+        base_x = inputs.permute([0, 2, 1])
+        base_x = self.norm(base_x)
+        x = self.relu(self.mlp1(base_x))
+        x = self.sigmoid(self.mlp2(x))
+        x = x.squeeze(-1)
+
+        return x
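A minimal forward-pass sketch under the default hyper-parameters from opts_egtea.py; the option dict below is hand-built for illustration, and the snippet runs on CPU since models.py itself never calls .cuda().

import torch
from models import MYNET, SuppressNet

opt = {"feat_dim": 2048, "num_of_class": 23, "hidden_dim": 1024,
       "enc_layer": 3, "enc_head": 8, "dec_layer": 5, "dec_head": 4,
       "segment_size": 64, "anchors": [2, 4, 6, 8, 12, 16]}

model = MYNET(opt)
inputs = torch.randn(2, 64, 2048)                 # batch x seq_len x (RGB + flow) features
anc_cls, anc_reg, snip_cls = model(inputs)
print(anc_cls.shape, anc_reg.shape, snip_cls.shape)
# torch.Size([2, 6, 23]) torch.Size([2, 6, 2]) torch.Size([2, 23])

supnet = SuppressNet(opt)
conf = supnet(torch.rand(2, 64, 22))              # batch x segment_size x (num_of_class - 1)
print(conf.shape)                                 # torch.Size([2, 22])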
opts_egtea.py ADDED
@@ -0,0 +1,62 @@
+import argparse
+
+
+def parse_opt():
+    parser = argparse.ArgumentParser()
+    # Overall settings
+    parser.add_argument('--mode', type=str, default='train')
+    parser.add_argument('--video_name', type=str, default=None, help='Name of the single video to evaluate')
+    parser.add_argument('--video_path', type=str, default='', help='Path to the input video file for visualization')
+    parser.add_argument('--checkpoint_path', type=str, default='./checkpoint')
+    parser.add_argument('--segment_size', type=int, default=64)
+    parser.add_argument('--anchors', type=str, default='2,4,6,8,12,16')
+    parser.add_argument('--seed', default=7, type=int, help='random seed for reproducibility')
+
+    # Overall dataset settings
+    parser.add_argument('--num_of_class', type=int, default=23)
+    parser.add_argument('--data_format', type=str, default="npz_i3d")
+    parser.add_argument('--data_rescale', default=False, action='store_true')
+    parser.add_argument('--predefined_fps', default=None, type=float)
+    parser.add_argument('--rgb_only', default=False, action='store_true')
+    parser.add_argument('--video_anno', type=str, default="./data/egtea_annotations_split{}.json")
+    parser.add_argument('--video_feature_all_train', type=str, default="./data/I3D/")
+    parser.add_argument('--video_feature_all_test', type=str, default="./data/I3D/")
+    parser.add_argument('--setup', type=str, default="")
+    parser.add_argument('--exp', type=str, default="01")
+    parser.add_argument('--split', type=str, default="1")
+
+    # Network
+    parser.add_argument('--feat_dim', type=int, default=2048)
+    parser.add_argument('--hidden_dim', type=int, default=1024)
+    parser.add_argument('--out_dim', type=int, default=23)
+    parser.add_argument('--enc_layer', type=int, default=3)
+    parser.add_argument('--enc_head', type=int, default=8)
+    parser.add_argument('--dec_layer', type=int, default=5)
+    parser.add_argument('--dec_head', type=int, default=4)
+
+    # Training settings
+    parser.add_argument('--batch_size', type=int, default=128)
+    parser.add_argument('--lr', type=float, default=1e-4)
+    parser.add_argument('--weight_decay', type=float, default=1e-4)
+    parser.add_argument('--epoch', type=int, default=5)
+    parser.add_argument('--lr_step', type=int, default=3)
+
+    # Post-processing
+    parser.add_argument('--alpha', type=float, default=1)
+    parser.add_argument('--beta', type=float, default=1)
+    parser.add_argument('--gamma', type=float, default=0.2)
+    parser.add_argument('--pptype', type=str, default="net")
+    parser.add_argument('--pos_threshold', type=float, default=0.5)
+    parser.add_argument('--sup_threshold', type=float, default=0.1)
+    parser.add_argument('--threshold', type=float, default=0.1)
+    parser.add_argument('--inference_subset', type=str, default="test")
+    parser.add_argument('--soft_nms', type=float, default=0.3)
+    parser.add_argument('--video_len_file', type=str, default="./output/video_len_{}.json")
+    parser.add_argument('--proposal_label_file', type=str, default="./output/proposal_label_{}.h5")
+    parser.add_argument('--suppress_label_file', type=str, default="./output/suppress_label_{}.h5")
+    parser.add_argument('--suppress_result_file', type=str, default="./output/suppress_result{}.h5")
+    parser.add_argument('--frame_result_file', type=str, default="./output/frame_result{}.h5")
+    parser.add_argument('--result_file', type=str, default="./output/result_proposal{}.json")
+    parser.add_argument('--wterm', type=bool, default=False)
+
+    args = parser.parse_args()
+    return args
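A short sketch of how the training scripts consume these options, mirroring the __main__ block of supnet.py (parse_opt reads sys.argv, so run it from a script rather than a notebook):

import opts_egtea as opts

opt = vars(opts.parse_opt())
opt['anchors'] = [int(item) for item in opt['anchors'].split(',')]   # "2,4,..." -> [2, 4, ...]
print(opt['num_of_class'], opt['anchors'], opt['segment_size'])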
output/README.md ADDED
@@ -0,0 +1 @@
+If the dataset has changed, it is recommended to delete all files in this folder and rerun the main function from the start.
requirements.txt ADDED
@@ -0,0 +1,5 @@
+h5py
+ipdb
+scikit-learn
+matplotlib
+tensorboardX
main.py ADDED
The diff for this file is too large to render. See raw diff
 
supnet.py ADDED
@@ -0,0 +1,637 @@
+import os
+import json
+import torch
+import torchvision
+import torch.nn.parallel
+import torch.nn.functional as F
+import torch.optim as optim
+import numpy as np
+import opts_egtea as opts
+import time
+import h5py
+from iou_utils import *
+from eval import evaluation_detection
+from tensorboardX import SummaryWriter
+from dataset import VideoDataSet, SuppressDataSet
+from models import MYNET, SuppressNet
+from loss_func import cls_loss_func, regress_loss_func, suppress_loss_func
+from tqdm import tqdm
+
+
+def train_one_epoch(opt, model, train_dataset, optimizer):
+    train_loader = torch.utils.data.DataLoader(train_dataset,
+                                               batch_size=opt['batch_size'], shuffle=True,
+                                               num_workers=0, pin_memory=True, drop_last=False)
+    epoch_cost = 0
+
+    for n_iter, (input_data, label) in enumerate(tqdm(train_loader)):
+        suppress_conf = model(input_data.cuda())
+
+        loss = suppress_loss_func(label, suppress_conf)
+        epoch_cost += loss.detach().cpu().numpy()
+
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+    return n_iter, epoch_cost
+
+
+def eval_one_epoch(opt, model, test_dataset):
+    test_loader = torch.utils.data.DataLoader(test_dataset,
+                                              batch_size=opt['batch_size'], shuffle=False,
+                                              num_workers=0, pin_memory=True, drop_last=False)
+    epoch_cost = 0
+
+    for n_iter, (input_data, label) in enumerate(tqdm(test_loader)):
+        suppress_conf = model(input_data.cuda())
+
+        loss = suppress_loss_func(label, suppress_conf)
+        epoch_cost += loss.detach().cpu().numpy()
+
+    return n_iter, epoch_cost
+
+
+def train(opt):
+    writer = SummaryWriter()
+    model = SuppressNet(opt).cuda()
+
+    optimizer = optim.Adam(model.parameters(), lr=opt["lr"], weight_decay=opt["weight_decay"])
+    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=opt["lr_step"])
+
+    train_dataset = SuppressDataSet(opt, subset="train")
+    test_dataset = SuppressDataSet(opt, subset=opt['inference_subset'])
+
+    for n_epoch in range(opt['epoch']):
+        n_iter, epoch_cost = train_one_epoch(opt, model, train_dataset, optimizer)
+
+        writer.add_scalars('sup_data/cost', {'train': epoch_cost / (n_iter + 1)}, n_epoch)
+        print("training loss(epoch %d): %f, lr - %f" % (n_epoch,
+                                                        epoch_cost / (n_iter + 1),
+                                                        optimizer.param_groups[0]["lr"]))
+
+        scheduler.step()
+        model.eval()
+
+        n_iter, eval_cost = eval_one_epoch(opt, model, test_dataset)
+
+        writer.add_scalars('sup_data/eval', {'test': eval_cost / (n_iter + 1)}, n_epoch)
+        print("testing loss(epoch %d): %f" % (n_epoch, eval_cost / (n_iter + 1)))
+
+        state = {'epoch': n_epoch + 1,
+                 'state_dict': model.state_dict()}
+        torch.save(state, opt["checkpoint_path"] + "/checkpoint_suppress_" + str(n_epoch + 1) + ".pth.tar")
+        if eval_cost < model.best_loss:
+            model.best_loss = eval_cost
+            torch.save(state, opt["checkpoint_path"] + "/ckp_best_suppress.pth.tar")
+
+        model.train()
+
+    writer.close()
+    return
+
+
+def eval_frame(opt, model, dataset):
+    test_loader = torch.utils.data.DataLoader(dataset,
+                                              batch_size=opt['batch_size'], shuffle=False,
+                                              num_workers=0, pin_memory=True, drop_last=False)
+
+    labels_cls = {}
+    labels_reg = {}
+    output_cls = {}
+    output_reg = {}
+    for video_name in dataset.video_list:
+        labels_cls[video_name] = []
+        labels_reg[video_name] = []
+        output_cls[video_name] = []
+        output_reg[video_name] = []
+
+    start_time = time.time()
+    total_frames = 0
+    epoch_cost = 0
+    epoch_cost_cls = 0
+    epoch_cost_reg = 0
+
+    for n_iter, (input_data, cls_label, reg_label, _) in enumerate(tqdm(test_loader)):
+        act_cls, act_reg, _ = model(input_data.cuda())
+
+        cost_cls = cls_loss_func(cls_label, act_cls)
+        epoch_cost_cls += cost_cls.detach().cpu().numpy()
+
+        cost_reg = regress_loss_func(reg_label, act_reg)
+        epoch_cost_reg += cost_reg.detach().cpu().numpy()
+
+        cost = opt['alpha'] * cost_cls + opt['beta'] * cost_reg
+        epoch_cost += cost.detach().cpu().numpy()
+
+        act_cls = torch.softmax(act_cls, dim=-1)
+
+        total_frames += input_data.size(0)
+
+        # Regroup the per-batch outputs by video.
+        for b in range(0, input_data.size(0)):
+            video_name, st, ed, data_idx = dataset.inputs[n_iter * opt['batch_size'] + b]
+            output_cls[video_name] += [act_cls[b, :].detach().cpu().numpy()]
+            output_reg[video_name] += [act_reg[b, :].detach().cpu().numpy()]
+            labels_cls[video_name] += [cls_label[b, :].numpy()]
+            labels_reg[video_name] += [reg_label[b, :].numpy()]
+
+    end_time = time.time()
+    working_time = end_time - start_time
+
+    for video_name in dataset.video_list:
+        labels_cls[video_name] = np.stack(labels_cls[video_name], axis=0)
+        labels_reg[video_name] = np.stack(labels_reg[video_name], axis=0)
+        output_cls[video_name] = np.stack(output_cls[video_name], axis=0)
+        output_reg[video_name] = np.stack(output_reg[video_name], axis=0)
+
+    cls_loss = epoch_cost_cls / n_iter
+    reg_loss = epoch_cost_reg / n_iter
+    tot_loss = epoch_cost / n_iter
+
+    return cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames
+
+
+def test(opt):
+    model = SuppressNet(opt).cuda()
+    # NB: train() saves ckp_best_suppress.pth.tar without the exp prefix used here.
+    checkpoint = torch.load(opt["checkpoint_path"] + "/" + opt['exp'] + "ckp_best_suppress.pth.tar")
+    base_dict = checkpoint['state_dict']
+    model.load_state_dict(base_dict)
+    model.eval()
+
+    dataset = SuppressDataSet(opt, subset=opt['inference_subset'])
+
+    test_loader = torch.utils.data.DataLoader(dataset,
+                                              batch_size=opt['batch_size'], shuffle=False,
+                                              num_workers=0, pin_memory=True, drop_last=False)
+    labels = {}
+    output = {}
+    for video_name in dataset.video_list:
+        labels[video_name] = []
+        output[video_name] = []
+
+    for n_iter, (input_data, label) in enumerate(test_loader):
+        suppress_conf = model(input_data.cuda())
+
+        for b in range(0, input_data.size(0)):
+            video_name, idx = dataset.inputs[n_iter * opt['batch_size'] + b]
+            output[video_name] += [suppress_conf[b, :].detach().cpu().numpy()]
+            labels[video_name] += [label[b, :].numpy()]
+
+    for video_name in dataset.video_list:
+        labels[video_name] = np.stack(labels[video_name], axis=0)
+        output[video_name] = np.stack(output[video_name], axis=0)
+
+    outfile = h5py.File(opt['suppress_result_file'].format(opt['exp']), 'w')
+
+    for video_name in dataset.video_list:
+        o = output[video_name]
+        l = labels[video_name]
+
+        dset_pred = outfile.create_dataset(video_name + '/pred', o.shape, maxshape=o.shape, chunks=True, dtype=np.float32)
+        dset_pred[:, :] = o[:, :]
+        dset_label = outfile.create_dataset(video_name + '/label', l.shape, maxshape=l.shape, chunks=True, dtype=np.float32)
+        dset_label[:, :] = l[:, :]
+    outfile.close()
+    print('complete')
+
+
+def make_dataset(opt):
+    model = MYNET(opt).cuda()
+    checkpoint = torch.load(opt["checkpoint_path"] + "/" + opt['exp'] + "_ckp_best.pth.tar")
+    base_dict = checkpoint['state_dict']
+    model.load_state_dict(base_dict)
+    model.eval()
+
+    dataset = VideoDataSet(opt, subset=opt['inference_subset'])
+
+    _, _, _, output_cls, output_reg, labels_cls, labels_reg, _, _ = eval_frame(opt, model, dataset)
+
+    proposal_dict = []
+
+    outfile = h5py.File(opt['suppress_label_file'].format(opt['inference_subset'] + '_' + opt['setup']), 'w')
+
+    num_class = opt["num_of_class"] - 1
+    unit_size = opt['segment_size']
+    threshold = opt['threshold']
+    anchors = opt['anchors']
+
+    for video_name in dataset.video_list:
+        duration = dataset.video_len[video_name]
+
+        for idx in range(0, duration):
+            cls_anc = output_cls[video_name][idx]
+            reg_anc = output_reg[video_name][idx]
+
+            proposal_anc_dict = []
+            for anc_idx in range(0, len(anchors)):
+                # Classes (background excluded) whose confidence clears the threshold.
+                cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
+
+                if len(cls) == 0:
+                    continue
+
+                # Decode the regression output: end-point offset and log length ratio.
+                ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
+                length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
+                st = ed - length
+
+                for cidx in range(0, len(cls)):
+                    label = cls[cidx]
+                    tmp_dict = {}
+                    tmp_dict["segment"] = [st, ed]
+                    tmp_dict["score"] = cls_anc[anc_idx][label]
+                    tmp_dict["label"] = label
+                    tmp_dict["gentime"] = idx
+                    proposal_anc_dict.append(tmp_dict)
+
+            proposal_anc_dict = non_max_suppression(proposal_anc_dict, overlapThresh=opt['soft_nms'])
+            proposal_dict += proposal_anc_dict
+
+        nms_dict = non_max_suppression(proposal_dict, overlapThresh=opt['soft_nms'])
+
+        # input: per frame, the confidences of proposals generated over the previous
+        # unit_size frames; label: 1 around the generation time of the proposals
+        # that survive NMS.
+        input_table = np.zeros((duration, unit_size, num_class), dtype=np.float32)
+        label_table = np.zeros((duration, num_class), dtype=np.float32)
+
+        for proposal in proposal_dict:
+            idx = proposal["gentime"]
+            conf = proposal["score"]
+            cls = proposal["label"]
+            for i in range(0, unit_size):
+                if idx + i < duration:
+                    input_table[idx + i, unit_size - 1 - i, cls] = conf
+
+        for proposal in nms_dict:
+            idx = proposal["gentime"]
+            cls = proposal["label"]
+            label_table[idx:idx + 3, cls] = 1
+
+        dset_input_table = outfile.create_dataset(video_name + '/input', input_table.shape, maxshape=input_table.shape, chunks=True, dtype=np.float32)
+        dset_label_table = outfile.create_dataset(video_name + '/label', label_table.shape, maxshape=label_table.shape, chunks=True, dtype=np.float32)
+
+        dset_input_table[:] = input_table
+        dset_label_table[:] = label_table
+
+        proposal_dict = []
+
+    outfile.close()
+    print('complete')
+    return
+
+
+def main(opt):
+    if opt['mode'] == 'train':
+        train(opt)
+    if opt['mode'] == 'test':
+        test(opt)
+    if opt['mode'] == 'make':
+        make_dataset(opt)
+
+    return
+
+
+if __name__ == '__main__':
+    opt = opts.parse_opt()
+    opt = vars(opt)
+    if not os.path.exists(opt["checkpoint_path"]):
+        os.makedirs(opt["checkpoint_path"])
+    opt_file = open(opt["checkpoint_path"] + "/" + opt['exp'] + "_opts.json", "w")
+    json.dump(opt, opt_file)
+    opt_file.close()
+
+    if opt['seed'] >= 0:
+        seed = opt['seed']
+        torch.manual_seed(seed)
+        np.random.seed(seed)
+        #random.seed(seed)
+
+    opt['anchors'] = [int(item) for item in opt['anchors'].split(',')]
+
+    main(opt)
+    while opt['wterm']:
+        pass
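The regression decoding in make_dataset can be followed on paper; a toy example with invented numbers:

import numpy as np

idx, anchor = 100, 16                 # current frame and anchor length (frames)
reg = (0.25, 0.1)                     # hypothetical network output (offset, log length ratio)

ed = idx + anchor * reg[0]            # predicted end: 104.0
length = anchor * np.exp(reg[1])      # predicted length: ~17.68
st = ed - length                      # predicted start: ~86.32
print(st, ed)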