Darknsu committed on
Commit e1fb045 · verified · 1 Parent(s): a53e7ee

Upload 25 files

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ data/Poppins[[:space:]]Black[[:space:]]Italic[[:space:]]900.ttf filter=lfs diff=lfs merge=lfs -text
+ data/Poppins[[:space:]]ExtraBold[[:space:]]Italic[[:space:]]800.ttf filter=lfs diff=lfs merge=lfs -text
Evaluation/__pycache__/eval_detection_gentime.cpython-310.pyc ADDED
Binary file (7.8 kB).
 
Evaluation/__pycache__/utils.cpython-310.pyc ADDED
Binary file (2.49 kB).
 
Evaluation/eval_detection_gentime.py ADDED
@@ -0,0 +1,566 @@
+ import json
+ #import urllib.request, urllib.error, urllib.parse
+
+ import numpy as np
+ import pandas as pd
+
+ from utils import get_blocked_videos
+ from utils import interpolated_prec_rec
+ from utils import segment_iou
+
+ class ANETdetection(object):
+
+     GROUND_TRUTH_FIELDS = ['database']#, 'taxonomy', 'version']
+     PREDICTION_FIELDS = ['results', 'version', 'external_data']
+
+     def __init__(self, opt, ground_truth_filename=None, prediction_filename=None,
+                  ground_truth_fields=GROUND_TRUTH_FIELDS,
+                  prediction_fields=PREDICTION_FIELDS,
+                  tiou_thresholds=np.linspace(0.5, 0.95, 10),
+                  subset='validation', verbose=False,
+                  check_status=True):
+         if not ground_truth_filename:
+             raise IOError('Please input a valid ground truth file.')
+         if not prediction_filename:
+             raise IOError('Please input a valid prediction file.')
+         self.subset = subset
+         self.tiou_thresholds = tiou_thresholds
+         self.verbose = verbose
+         self.gt_fields = ground_truth_fields
+         self.pred_fields = prediction_fields
+         self.ap = None
+         self.tdiff = None
+         self.check_status = check_status
+         self.num_class = opt["num_of_class"]
+         # Retrieve blocked videos from server.
+         if self.check_status:
+             self.blocked_videos = get_blocked_videos()
+         else:
+             self.blocked_videos = list()
+         # Import ground truth and predictions.
+         self.ground_truth, self.activity_index, cidx = self._import_ground_truth(
+             ground_truth_filename)
+         self.prediction = self._import_prediction(prediction_filename, cidx)
+
+         if self.verbose:
+             print('[INIT] Loaded annotations from {} subset.'.format(subset))
+             nr_gt = len(self.ground_truth)
+             print('\tNumber of ground truth instances: {}'.format(nr_gt))
+             nr_pred = len(self.prediction)
+             print('\tNumber of predictions: {}'.format(nr_pred))
+             print('\tFixed threshold for tiou score: {}'.format(self.tiou_thresholds))
+
+     def _import_ground_truth(self, ground_truth_filename):
+         """Reads ground truth file, checks that it is well formatted, and returns
+         the ground truth instances and the activity classes.
+
+         Parameters
+         ----------
+         ground_truth_filename : str
+             Full path to the ground truth json file.
+
+         Outputs
+         -------
+         ground_truth : df
+             Data frame containing the ground truth instances.
+         activity_index : dict
+             Dictionary mapping class labels to indices.
+         """
+         with open(ground_truth_filename, 'r') as fobj:
+             data = json.load(fobj)
+         # Checking format
+         if not all([field in list(data.keys()) for field in self.gt_fields]):
+             raise IOError('Please input a valid ground truth file.')
+
+         # Read ground truth data.
+         activity_index, cidx = {}, 0
+
+         video_lst, t_start_lst, t_end_lst, label_lst = [], [], [], []
+         for videoid, v in data['database'].items():
+             if self.subset not in v['subset']:
+                 continue
+
+             for ann in v['annotations']:
+                 if ann['label'] not in activity_index:
+                     activity_index[ann['label']] = cidx
+                     cidx += 1
+                 video_lst.append(videoid)
+                 t_start_lst.append(ann['segment'][0])
+                 t_end_lst.append(ann['segment'][1])
+                 label_lst.append(activity_index[ann['label']])
+
+         ground_truth = pd.DataFrame({'video-id': video_lst,
+                                      't-start': t_start_lst,
+                                      't-end': t_end_lst,
+                                      'label': label_lst})
+
+         return ground_truth, activity_index, cidx
+
+     def _import_prediction(self, prediction_filename, cidx):
+         """Reads prediction file, checks that it is well formatted, and returns
+         the prediction instances.
+
+         Parameters
+         ----------
+         prediction_filename : str
+             Full path to the prediction json file.
+
+         Outputs
+         -------
+         prediction : df
+             Data frame containing the prediction instances.
+         """
+         with open(prediction_filename, 'r') as fobj:
+             data = json.load(fobj)
+         # Checking format...
+         if not all([field in list(data.keys()) for field in self.pred_fields]):
+             raise IOError('Please input a valid prediction file.')
+
+         # Read predictions.
+         video_lst, t_start_lst, t_end_lst = [], [], []
+         label_lst, score_lst = [], []
+         gentime_lst = []
+         for videoid, v in data['results'].items():
+             if videoid in self.blocked_videos:
+                 continue
+             for result in v:
+                 if result['label'] not in self.activity_index.keys():
+                     continue
+
+                 label = self.activity_index[result['label']]
+                 video_lst.append(videoid)
+                 t_start_lst.append(result['segment'][0])
+                 t_end_lst.append(result['segment'][1])
+                 label_lst.append(label)
+                 score_lst.append(result['score'])
+                 gentime_lst.append(result['gentime'])
+
+         prediction = pd.DataFrame({'video-id': video_lst,
+                                    't-start': t_start_lst,
+                                    't-end': t_end_lst,
+                                    'label': label_lst,
+                                    'score': score_lst,
+                                    'gentime': gentime_lst})
+         return prediction
+
+     def wrapper_compute_average_precision(self):
+         """Computes average precision for each class in the subset.
+         """
+         ap = np.zeros((len(self.tiou_thresholds), len(self.activity_index)))
+         tdiff = np.zeros((len(self.tiou_thresholds), len(self.activity_index)))
+         cnt_tp = np.zeros((len(self.tiou_thresholds), len(self.activity_index)))
+         for activity, cidx in self.activity_index.items():
+             gt_idx = self.ground_truth['label'] == cidx
+             pred_idx = self.prediction['label'] == cidx
+             ap[:, cidx], tdiff[:, cidx], cnt_tp[:, cidx] = compute_average_precision_detection(
+                 self.ground_truth.loc[gt_idx].reset_index(drop=True),
+                 self.prediction.loc[pred_idx].reset_index(drop=True),
+                 tiou_thresholds=self.tiou_thresholds)
+
+         sum_tdiff = np.sum(tdiff, axis=1)
+         total_tp = np.sum(cnt_tp, axis=1)
+         # Guard against division by zero when a threshold has no true positives.
+         final_tdiff = np.zeros_like(sum_tdiff)
+         valid = total_tp > 0
+         final_tdiff[valid] = sum_tdiff[valid] / total_tp[valid]
+
+         return ap, final_tdiff
+
+     def evaluate(self):
+         """Evaluates a prediction file. For the detection task we measure the
+         interpolated mean average precision to measure the performance of a
+         method.
+         """
+         self.ap, self.tdiff = self.wrapper_compute_average_precision()
+         self.mAP = self.ap.mean(axis=1)
+         if self.verbose:
+             print('[RESULTS] Performance on ActivityNet detection task.')
+             print('\tAverage-mAP: {}'.format(self.mAP.mean()))
+             print('\tAverage-time diff: {}'.format(self.tdiff.mean()))
+
+ def compute_average_precision_detection(ground_truth, prediction, tiou_thresholds=np.linspace(0.5, 0.95, 10)):
+     """Compute average precision (detection task) between ground truth and
+     prediction data frames. If multiple predictions occur for the same
+     predicted segment, only the one with the highest score is matched as a
+     true positive. This code is greatly inspired by the Pascal VOC devkit.
+
+     Parameters
+     ----------
+     ground_truth : df
+         Data frame containing the ground truth instances.
+         Required fields: ['video-id', 't-start', 't-end']
+     prediction : df
+         Data frame containing the prediction instances.
+         Required fields: ['video-id', 't-start', 't-end', 'score']
+     tiou_thresholds : 1darray, optional
+         Temporal intersection over union thresholds.
+
+     Outputs
+     -------
+     ap : 1darray
+         Average precision score at each tiou threshold.
+     """
+     npos = float(len(ground_truth))
+     lock_gt = np.ones((len(tiou_thresholds), len(ground_truth))) * -1
+
+     # Sort predictions by decreasing score order.
+     sort_idx = prediction['score'].values.argsort()[::-1]
+     prediction = prediction.loc[sort_idx].reset_index(drop=True)
+
+     # Initialize true positive and false positive vectors.
+     tp = np.zeros((len(tiou_thresholds), len(prediction)))
+     fp = np.zeros((len(tiou_thresholds), len(prediction)))
+     timediff = np.zeros((len(tiou_thresholds), len(prediction)))
+
+     # Adaptation to query faster.
+     ground_truth_gbvn = ground_truth.groupby('video-id')
+
+     # Assigning true positives to ground truth instances.
+     for idx, this_pred in prediction.iterrows():
+
+         try:
+             # Check if there is at least one ground truth in the associated video.
+             ground_truth_videoid = ground_truth_gbvn.get_group(this_pred['video-id'])
+         except Exception:
+             fp[:, idx] = 1
+             continue
+
+         this_gt = ground_truth_videoid.reset_index()
+         tiou_arr = segment_iou(this_pred[['t-start', 't-end']].values,
+                                this_gt[['t-start', 't-end']].values)
+         gentime_pred = this_pred['gentime']
+         gentime_gt_arr = this_gt['t-end'].values
+         tiou_sorted_idx = tiou_arr.argsort()[::-1]
+         for tidx, tiou_thr in enumerate(tiou_thresholds):
+             for jdx in tiou_sorted_idx:
+                 if tiou_arr[jdx] < tiou_thr:
+                     fp[tidx, idx] = 1
+                     break
+                 if lock_gt[tidx, this_gt.loc[jdx]['index']] >= 0:
+                     continue
+                 # Assign as true positive after the filters above.
+                 tp[tidx, idx] = 1
+                 # Delay from the ground-truth action end to the time the
+                 # prediction was generated; skip non-finite values.
+                 time_diff = gentime_pred - gentime_gt_arr[jdx]
+                 timediff[tidx, idx] = time_diff if np.isfinite(time_diff) else 0.0
+                 lock_gt[tidx, this_gt.loc[jdx]['index']] = idx
+                 break
+
+             if fp[tidx, idx] == 0 and tp[tidx, idx] == 0:
+                 fp[tidx, idx] = 1
+
+     ap = np.zeros(len(tiou_thresholds))
+     tdiff = np.zeros(len(tiou_thresholds))
+     cnt_tp = np.zeros(len(tiou_thresholds))
+
+     for tidx in range(len(tiou_thresholds)):
+         # Computing prec-rec
+         this_tp = np.cumsum(tp[tidx, :]).astype(float)
+         this_fp = np.cumsum(fp[tidx, :]).astype(float)
+
+         # Guard the degenerate cases (no ground truth or no predictions).
+         if npos == 0 or len(this_tp) == 0:
+             continue
+
+         rec = this_tp / npos
+         prec = this_tp / (this_tp + this_fp)
+         ap[tidx] = interpolated_prec_rec(prec, rec)
+         this_tdiff = np.cumsum(timediff[tidx, :]).astype(float)
+         tdiff[tidx] = this_tdiff[-1]
+         cnt_tp[tidx] = this_tp[-1]
+
+     return ap, tdiff, cnt_tp
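
For orientation, a minimal sketch of driving this evaluator directly; the option values and file names below are illustrative assumptions, not part of the commit:

    import numpy as np
    from eval_detection_gentime import ANETdetection

    opt = {"num_of_class": 23}  # assumption: matches num_classes in loss_func.py
    # result.json (hypothetical) must carry 'results', 'version', 'external_data',
    # and a per-segment 'gentime' field alongside 'label', 'segment', 'score'.
    evaluator = ANETdetection(opt,
                              ground_truth_filename='data/thumos14_v2.json',
                              prediction_filename='result.json',
                              tiou_thresholds=np.linspace(0.1, 0.5, 5),
                              subset='test', verbose=True, check_status=False)
    evaluator.evaluate()
    print('Average-mAP:', evaluator.mAP.mean(), 'AEDT:', evaluator.tdiff.mean())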
Evaluation/utils.py ADDED
@@ -0,0 +1,76 @@
+ import json
+ #import urllib.request, urllib.error, urllib.parse
+
+ import numpy as np
+
+ API = 'http://ec2-52-11-11-89.us-west-2.compute.amazonaws.com/challenge17/api.py'
+
+ def get_blocked_videos(api=API):
+     # api_url = '{}?action=get_blocked'.format(api)
+     # req = urllib.request.Request(api_url)
+     # response = urllib.request.urlopen(req)
+     # return json.loads(response.read())
+     return list()
+
+ def interpolated_prec_rec(prec, rec):
+     """Interpolated AP - VOCdevkit from VOC 2011.
+     """
+     mprec = np.hstack([[0], prec, [0]])
+     mrec = np.hstack([[0], rec, [1]])
+     for i in range(len(mprec) - 1)[::-1]:
+         mprec[i] = max(mprec[i], mprec[i + 1])
+     idx = np.where(mrec[1::] != mrec[0:-1])[0] + 1
+     ap = np.sum((mrec[idx] - mrec[idx - 1]) * mprec[idx])
+     return ap
+
+ def segment_iou(target_segment, candidate_segments):
+     """Compute the temporal intersection over union between a
+     target segment and all the test segments.
+
+     Parameters
+     ----------
+     target_segment : 1d array
+         Temporal target segment containing [starting, ending] times.
+     candidate_segments : 2d array
+         Temporal candidate segments containing N x [starting, ending] times.
+
+     Outputs
+     -------
+     tiou : 1d array
+         Temporal intersection over union score of the N candidate segments.
+     """
+     tt1 = np.maximum(target_segment[0], candidate_segments[:, 0])
+     tt2 = np.minimum(target_segment[1], candidate_segments[:, 1])
+     # Intersection including non-negative overlap score.
+     segments_intersection = (tt2 - tt1).clip(0)
+     # Segment union.
+     segments_union = (candidate_segments[:, 1] - candidate_segments[:, 0]) \
+         + (target_segment[1] - target_segment[0]) - segments_intersection
+     # Compute overlap as the ratio of the intersection
+     # over union of two segments.
+     tIoU = segments_intersection.astype(float) / segments_union
+     return tIoU
+
+ def wrapper_segment_iou(target_segments, candidate_segments):
+     """Compute intersection over union between segments.
+
+     Parameters
+     ----------
+     target_segments : ndarray
+         2-dim array in format [m x 2:=[init, end]]
+     candidate_segments : ndarray
+         2-dim array in format [n x 2:=[init, end]]
+
+     Outputs
+     -------
+     tiou : ndarray
+         2-dim array [n x m] with IoU ratio.
+     Note: It assumes that candidate segments are scarcer than target segments.
+     """
+     if candidate_segments.ndim != 2 or target_segments.ndim != 2:
+         raise ValueError('Dimension of arguments is incorrect')
+
+     n, m = candidate_segments.shape[0], target_segments.shape[0]
+     tiou = np.empty((n, m))
+     for i in range(m):
+         tiou[:, i] = segment_iou(target_segments[i, :], candidate_segments)
+
+     return tiou
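
A quick sanity check of segment_iou on toy values (the numbers are made up for illustration):

    import numpy as np
    from utils import segment_iou

    target = np.array([2.0, 6.0])                    # one [start, end] segment
    candidates = np.array([[2.0, 6.0], [4.0, 8.0]])  # N x [start, end]
    print(segment_iou(target, candidates))           # -> [1.0, 0.3333...]  (2s overlap / 6s union)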
checkpoint/README.md ADDED
@@ -0,0 +1 @@
+ Please put the model files in this folder.
data/Poppins Black Italic 900.ttf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d56d2b8ff884cfae1b637e73a71f3caf1d16cdb5b4acc123d9cd0b5864ca2567
+ size 156916
data/Poppins ExtraBold Italic 800.ttf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:db8f803d5aaf8e646fd868d0a897ed9997985b88c931bfae3e08c7c8dc2556be
+ size 158896
data/egtea_annotations_split1.json ADDED
The diff for this file is too large to render. See raw diff
 
data/egtea_annotations_split2.json ADDED
The diff for this file is too large to render. See raw diff
 
data/egtea_annotations_split3.json ADDED
The diff for this file is too large to render. See raw diff
 
data/egtea_annotations_split4.json ADDED
The diff for this file is too large to render. See raw diff
 
data/test_video_annotations.json ADDED
The diff for this file is too large to render. See raw diff
 
data/thumos14_v2.json ADDED
The diff for this file is too large to render. See raw diff
 
data/thumos14_v2_small.json ADDED
The diff for this file is too large to render. See raw diff
 
dataset.py ADDED
@@ -0,0 +1,533 @@
+ import numpy as np
+ import h5py
+ import json
+ import torch
+ import torch.utils.data as data
+ import os
+ import pickle
+ from multiprocessing import Pool
+
+ def load_json(file):
+     with open(file) as json_file:
+         data = json.load(json_file)
+     return data
+
+ def calc_iou(a, b):
+     st = a[0] - a[1]
+     ed = a[0]
+     target_st = b[0] - b[1]
+     target_ed = b[0]
+     sst = min(st, target_st)
+     led = max(ed, target_ed)
+     lst = max(st, target_st)
+     sed = min(ed, target_ed)
+     iou = (sed - lst) / max(led - sst, 1)
+     return iou
+
+ def box_include(y, target):
+     st = y[0] - y[1]
+     ed = y[0]
+     target_st = target[0] - target[1]
+     target_ed = target[0]
+     detection_point = target_st
+     if ed > detection_point and target_st < st and target_ed > ed:
+         return True
+     return False
+
+ class VideoDataSet(data.Dataset):
+     def __init__(self, opt, subset="train", video_name=None):
+         self.subset = subset
+         self.mode = opt["mode"]
+         self.predefined_fps = opt["predefined_fps"]
+         self.video_anno_path = opt["video_anno"].format(opt["split"])
+         self.video_len_path = opt["video_len_file"].format(self.subset + '_' + opt["setup"])
+         self.num_of_class = opt["num_of_class"]
+         self.segment_size = opt["segment_size"]
+         self.label_name = []
+         self.match_score = {}
+         self.match_score_end = {}
+         self.match_length = {}
+         self.gt_action = {}
+         self.cls_label = {}
+         self.reg_label = {}
+         self.snip_label = {}
+         self.inputs = []
+         self.inputs_all = []
+         self.data_rescale = opt["data_rescale"]
+         self.anchors = opt["anchors"]
+         self.pos_threshold = opt["pos_threshold"]
+         self.single_video_name = video_name
+
+         self._getDatasetDict()
+         self._loadFeaturelen(opt)
+         self._getMatchScore()
+         self._makeInputSeq()
+         self._loadPropLabel(opt['proposal_label_file'].format(self.subset + '_' + opt["setup"]))
+
+         if self.subset == "train":
+             if opt['data_format'] == "h5":
+                 feature_rgb_file = h5py.File(opt["video_feature_rgb_train"], 'r')
+                 self.feature_rgb_file = {}
+                 keys = self.video_list
+                 for vidx in range(len(keys)):
+                     if keys[vidx] not in feature_rgb_file:
+                         raise ValueError(f"Features for video {keys[vidx]} not found in {opt['video_feature_rgb_train']}")
+                     self.feature_rgb_file[keys[vidx]] = np.array(feature_rgb_file[keys[vidx]][:])
+                 if opt['rgb_only']:
+                     self.feature_flow_file = None
+                 else:
+                     self.feature_flow_file = {}
+                     feature_flow_file = h5py.File(opt["video_feature_flow_train"], 'r')
+                     for vidx in range(len(keys)):
+                         if keys[vidx] not in feature_flow_file:
+                             raise ValueError(f"Features for video {keys[vidx]} not found in {opt['video_feature_flow_train']}")
+                         self.feature_flow_file[keys[vidx]] = np.array(feature_flow_file[keys[vidx]][:])
+             elif opt['data_format'] == "pickle":
+                 feature_All = pickle.load(open(opt["video_feature_all_train"], 'rb'))
+                 self.feature_rgb_file = {}
+                 self.feature_flow_file = {}
+                 keys = self.video_list
+                 for vidx in range(len(keys)):
+                     if keys[vidx] not in feature_All:
+                         raise ValueError(f"Features for video {keys[vidx]} not found in {opt['video_feature_all_train']}")
+                     self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]]['rgb']
+                     self.feature_flow_file[keys[vidx]] = feature_All[keys[vidx]]['flow']
+             elif opt['data_format'] == "npz":
+                 feature_All = {}
+                 self.feature_rgb_file = {}
+                 self.feature_flow_file = {}
+                 for file in self.video_list:
+                     feature_path = opt["video_feature_all_train"] + file + '.npz'
+                     if not os.path.exists(feature_path):
+                         raise ValueError(f"Feature file {feature_path} not found")
+                     feature_All[file] = np.load(feature_path)['feats']
+                 keys = self.video_list
+                 for vidx in range(len(keys)):
+                     self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]][:]
+                 self.feature_flow_file = None
+             elif opt['data_format'] == "npz_i3d":
+                 feature_All = {}
+                 self.feature_rgb_file = {}
+                 self.feature_flow_file = {}
+                 for file in self.video_list:
+                     feature_path = opt["video_feature_all_train"] + file + '.npz'
+                     if not os.path.exists(feature_path):
+                         raise ValueError(f"Feature file {feature_path} not found")
+                     feature_All[file] = np.load(feature_path)
+                 keys = self.video_list
+                 for vidx in range(len(keys)):
+                     self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]]['rgb']
+                     self.feature_flow_file[keys[vidx]] = feature_All[keys[vidx]]['flow']
+             elif opt['data_format'] == "pt":
+                 feature_All = {}
+                 self.feature_rgb_file = {}
+                 self.feature_flow_file = {}
+                 for file in self.video_list:
+                     feature_path = opt["video_feature_all_train"] + file + '.pt'
+                     if not os.path.exists(feature_path):
+                         raise ValueError(f"Feature file {feature_path} not found")
+                     feature_All[file] = torch.load(feature_path)
+                 keys = self.video_list
+                 for vidx in range(len(keys)):
+                     self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]][:]
+                 self.feature_flow_file = None
+         else:
+             if opt['data_format'] == "h5":
+                 feature_rgb_file = h5py.File(opt["video_feature_rgb_test"], 'r')
+                 self.feature_rgb_file = {}
+                 keys = self.video_list
+                 for vidx in range(len(keys)):
+                     if keys[vidx] not in feature_rgb_file:
+                         raise ValueError(f"Features for video {keys[vidx]} not found in {opt['video_feature_rgb_test']}")
+                     self.feature_rgb_file[keys[vidx]] = np.array(feature_rgb_file[keys[vidx]][:])
+                 if opt['rgb_only']:
+                     self.feature_flow_file = None
+                 else:
+                     self.feature_flow_file = {}
+                     feature_flow_file = h5py.File(opt["video_feature_flow_test"], 'r')
+                     for vidx in range(len(keys)):
+                         if keys[vidx] not in feature_flow_file:
+                             raise ValueError(f"Features for video {keys[vidx]} not found in {opt['video_feature_flow_test']}")
+                         self.feature_flow_file[keys[vidx]] = np.array(feature_flow_file[keys[vidx]][:])
+             elif opt['data_format'] == "pickle":
+                 feature_All = pickle.load(open(opt["video_feature_all_test"], 'rb'))
+                 self.feature_rgb_file = {}
+                 self.feature_flow_file = {}
+                 keys = self.video_list
+                 for vidx in range(len(keys)):
+                     if keys[vidx] not in feature_All:
+                         raise ValueError(f"Features for video {keys[vidx]} not found in {opt['video_feature_all_test']}")
+                     self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]]['rgb']
+                     self.feature_flow_file[keys[vidx]] = feature_All[keys[vidx]]['flow']
+             elif opt['data_format'] == "npz":
+                 feature_All = {}
+                 self.feature_rgb_file = {}
+                 self.feature_flow_file = {}
+                 for file in self.video_list:
+                     feature_path = opt["video_feature_all_test"] + file + '.npz'
+                     if not os.path.exists(feature_path):
+                         raise ValueError(f"Feature file {feature_path} not found")
+                     feature_All[file] = np.load(feature_path)['feats']
+                 keys = self.video_list
+                 for vidx in range(len(keys)):
+                     self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]][:]
+                 self.feature_flow_file = None
+             elif opt['data_format'] == "npz_i3d":
+                 feature_All = {}
+                 self.feature_rgb_file = {}
+                 self.feature_flow_file = {}
+                 for file in self.video_list:
+                     feature_path = opt["video_feature_all_test"] + file + '.npz'
+                     if not os.path.exists(feature_path):
+                         raise ValueError(f"Feature file {feature_path} not found")
+                     feature_All[file] = np.load(feature_path)
+                 keys = self.video_list
+                 for vidx in range(len(keys)):
+                     self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]]['rgb']
+                     self.feature_flow_file[keys[vidx]] = feature_All[keys[vidx]]['flow']
+             elif opt['data_format'] == "pt":
+                 feature_All = {}
+                 self.feature_rgb_file = {}
+                 self.feature_flow_file = {}
+                 for file in self.video_list:
+                     feature_path = opt["video_feature_all_test"] + file + '.pt'
+                     if not os.path.exists(feature_path):
+                         raise ValueError(f"Feature file {feature_path} not found")
+                     feature_All[file] = torch.load(feature_path)
+                 keys = self.video_list
+                 for vidx in range(len(keys)):
+                     self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]][:]
+                 self.feature_flow_file = None
+
+     def _loadFeaturelen(self, opt):
+         if os.path.exists(self.video_len_path):
+             self.video_len = load_json(self.video_len_path)
+             return
+
+         self.video_len = {}
+         if self.subset == "train":
+             if opt['data_format'] == "h5":
+                 feature_file = h5py.File(opt["video_feature_rgb_train"], 'r')
+             elif opt['data_format'] == "pickle":
+                 feature_file = pickle.load(open(opt["video_feature_all_train"], 'rb'))
+             elif opt['data_format'] == "npz":
+                 feature_file = {}
+                 for file in self.video_list:
+                     feature_file[file] = np.load(opt["video_feature_all_train"] + file + '.npz')['feats']
+             elif opt['data_format'] == "npz_i3d":
+                 feature_file = {}
+                 for file in self.video_list:
+                     feature_file[file] = np.load(opt["video_feature_all_train"] + file + '.npz')
+             elif opt['data_format'] == "pt":
+                 feature_file = {}
+                 for file in self.video_list:
+                     feature_file[file] = torch.load(opt["video_feature_all_train"] + file + '.pt')
+         else:
+             if opt['data_format'] == "h5":
+                 feature_file = h5py.File(opt["video_feature_rgb_test"], 'r')
+             elif opt['data_format'] == "pickle":
+                 feature_file = pickle.load(open(opt["video_feature_all_test"], 'rb'))
+             elif opt['data_format'] == "npz":
+                 feature_file = {}
+                 for file in self.video_list:
+                     feature_file[file] = np.load(opt["video_feature_all_test"] + file + '.npz')['feats']
+             elif opt['data_format'] == "npz_i3d":
+                 feature_file = {}
+                 for file in self.video_list:
+                     feature_file[file] = np.load(opt["video_feature_all_test"] + file + '.npz')
+             elif opt['data_format'] == "pt":
+                 feature_file = {}
+                 for file in self.video_list:
+                     feature_file[file] = torch.load(opt["video_feature_all_test"] + file + '.pt')
+
+         keys = self.video_list
+         if opt['data_format'] == "h5":
+             for vidx in range(len(keys)):
+                 self.video_len[keys[vidx]] = len(feature_file[keys[vidx]])
+         elif opt['data_format'] == "pickle":
+             for vidx in range(len(keys)):
+                 self.video_len[keys[vidx]] = len(feature_file[keys[vidx]]['rgb'])
+         elif opt['data_format'] == "npz":
+             for vidx in range(len(keys)):
+                 self.video_len[keys[vidx]] = len(feature_file[keys[vidx]])
+         elif opt['data_format'] == "npz_i3d":
+             for vidx in range(len(keys)):
+                 self.video_len[keys[vidx]] = len(feature_file[keys[vidx]]['rgb'])
+         elif opt['data_format'] == "pt":
+             for vidx in range(len(keys)):
+                 self.video_len[keys[vidx]] = len(feature_file[keys[vidx]])
+         outfile = open(self.video_len_path, "w")
+         json.dump(self.video_len, outfile, indent=2)
+         outfile.close()
+
+     def _getDatasetDict(self):
+         anno_database = load_json(self.video_anno_path)
+         anno_database = anno_database['database']
+         self.video_dict = {}
+         if self.single_video_name:
+             if self.single_video_name in anno_database:
+                 video_info = anno_database[self.single_video_name]
+                 video_subset = video_info['subset']
+                 if self.subset == "full" or self.subset in video_subset:
+                     self.video_dict[self.single_video_name] = video_info
+                     for seg in video_info['annotations']:
+                         if not seg['label'] in self.label_name:
+                             self.label_name.append(seg['label'])
+             else:
+                 raise ValueError(f"Video {self.single_video_name} not found in annotation database")
+         else:
+             for video_name in anno_database:
+                 video_info = anno_database[video_name]
+                 video_subset = anno_database[video_name]['subset']
+                 if self.subset == "full" or self.subset in video_subset:
+                     self.video_dict[video_name] = video_info
+                     for seg in video_info['annotations']:
+                         if not seg['label'] in self.label_name:
+                             self.label_name.append(seg['label'])
+
+         # Ensure all 22 EGTEA action classes are included
+         expected_labels = [
+             'Clean/Wipe', 'Close', 'Compress', 'Crack', 'Cut', 'Divide/Pull Apart',
+             'Dry', 'Inspect/Read', 'Mix', 'Move Around', 'Open', 'Operate', 'Other',
+             'Pour', 'Put', 'Squeeze', 'Take', 'Transfer', 'Turn off', 'Turn on', 'Wash',
+             'Spread'  # Assumed missing label; replace with actual label if known
+         ]
+         for label in expected_labels:
+             if label not in self.label_name:
+                 self.label_name.append(label)
+
+         self.label_name.sort()
+         self.video_list = list(self.video_dict.keys())
+         print(f"Labels in dataset.label_name: {self.label_name}")
+         print(f"Number of labels: {len(self.label_name)}, Expected: {self.num_of_class-1}")
+         print(f"{self.subset} subset video numbers: {len(self.video_list)}")
+
+     def _getMatchScore(self):
+         self.action_end_count = torch.zeros(2)
+         for index in range(0, len(self.video_list)):
+             video_name = self.video_list[index]
+             video_info = self.video_dict[video_name]
+             video_labels = video_info['annotations']
+             gt_bbox = []
+             gt_edlen = []
+
+             second_to_frame = self.video_len[video_name] / float(video_info['duration'])
+             for j in range(len(video_labels)):
+                 tmp_info = video_labels[j]
+                 tmp_start = tmp_info['segment'][0] * second_to_frame
+                 tmp_end = tmp_info['segment'][1] * second_to_frame
+                 tmp_label = self.label_name.index(tmp_info['label'])
+                 gt_bbox.append([tmp_start, tmp_end, tmp_label])
+                 gt_edlen.append([gt_bbox[-1][1], gt_bbox[-1][1] - gt_bbox[-1][0], tmp_label])
+
+             gt_bbox = np.array(gt_bbox)
+             gt_edlen = np.array(gt_edlen)
+             self.gt_action[video_name] = gt_edlen
+
+             match_score = np.zeros((self.video_len[video_name], self.num_of_class - 1), dtype=np.float32)
+             for idx in range(gt_bbox.shape[0]):
+                 ed = int(gt_bbox[idx, 1]) + 1
+                 st = int(gt_bbox[idx, 0])
+                 match_score[st:ed, int(gt_bbox[idx, 2])] = idx + 1
+             self.match_score[video_name] = match_score
+
+     def _makeInputSeq(self):
+         data_idx = 0
+         for index in range(0, len(self.video_list)):
+             video_name = self.video_list[index]
+             duration = self.match_score[video_name].shape[0]
+             for i in range(1, duration + 1):
+                 st = i - self.segment_size
+                 ed = i
+                 self.inputs_all.append([video_name, st, ed, data_idx])
+                 data_idx += 1
+
+         self.inputs = self.inputs_all.copy()
+         print(f"{self.subset} subset seg numbers: {len(self.inputs)}")
+
+     def _makePropLabelUnit(self, i):
+         video_name = self.inputs_all[i][0]
+         st = self.inputs_all[i][1]
+         ed = self.inputs_all[i][2]
+         cls_anc = []
+         reg_anc = []
+
+         for j in range(0, len(self.anchors)):
+             v1 = np.zeros(self.num_of_class)
+             v1[-1] = 1
+             v2 = np.zeros(2)
+             v2[-1] = -1e3
+             y_box = [ed - 1, self.anchors[j]]
+
+             subset_label = self._get_train_label_with_class(video_name, ed - self.anchors[j], ed)
+             idx_list = []
+             for ii in range(0, subset_label.shape[0]):
+                 for jj in range(0, subset_label.shape[1]):
+                     idx = int(subset_label[ii, jj])
+                     if idx > 0 and idx - 1 not in idx_list:
+                         idx_list.append(idx - 1)
+
+             for idx in idx_list:
+                 target_box = self.gt_action[video_name][idx]
+                 cls = int(target_box[2])
+                 iou = calc_iou(y_box, target_box)
+                 if iou >= self.pos_threshold or (j == len(self.anchors) - 1 and box_include(y_box, target_box)) or (j == 0 and box_include(target_box, y_box)):
+                     v1[cls] = 1
+                     v1[-1] = 0
+                     v2[0] = 1.0 * (target_box[0] - y_box[0]) / self.anchors[j]
+                     v2[1] = np.log(1.0 * max(1, target_box[1]) / y_box[1])
+
+             cls_anc.append(v1)
+             reg_anc.append(v2)
+
+         v0 = np.zeros(self.num_of_class)
+         v0[-1] = 1
+         segment_size = ed - st
+         y_box = [ed - 1, self.anchors[-1]]
+         subset_label = self._get_train_label_with_class(video_name, ed - self.anchors[-1], ed)
+         idx_list = []
+         for ii in range(0, subset_label.shape[0]):
+             for jj in range(0, subset_label.shape[1]):
+                 idx = int(subset_label[ii, jj])
+                 if idx > 0 and idx - 1 not in idx_list:
+                     idx_list.append(idx - 1)
+
+         for idx in idx_list:
+             target_box = self.gt_action[video_name][idx]
+             cls = int(target_box[2])
+             iou = calc_iou(y_box, target_box)
+             if iou >= 0:
+                 v0[cls] = 1
+                 v0[-1] = 0
+
+         cls_anc = np.stack(cls_anc, axis=0)
+         reg_anc = np.stack(reg_anc, axis=0)
+         cls_snip = np.array(v0)
+         return cls_anc, reg_anc, cls_snip
+
+     def _loadPropLabel(self, filename):
+         if os.path.exists(filename):
+             prop_label_file = h5py.File(filename, 'r')
+             self.cls_label = np.array(prop_label_file['cls_label'][:])
+             self.reg_label = np.array(prop_label_file['reg_label'][:])
+             self.snip_label = np.array(prop_label_file['snip_label'][:])
+             prop_label_file.close()
+             self.action_frame_count = np.sum(self.cls_label.reshape((-1, self.cls_label.shape[-1])), axis=0)
+             self.action_frame_count = torch.Tensor(self.action_frame_count)
+             return
+
+         pool = Pool(os.cpu_count() // 2)
+         labels = pool.map(self._makePropLabelUnit, range(0, len(self.inputs_all)))
+         pool.close()
+         pool.join()
+
+         cls_label = []
+         reg_label = []
+         snip_label = []
+         for i in range(0, len(labels)):
+             cls_label.append(labels[i][0])
+             reg_label.append(labels[i][1])
+             snip_label.append(labels[i][2])
+         self.cls_label = np.stack(cls_label, axis=0)
+         self.reg_label = np.stack(reg_label, axis=0)
+         self.snip_label = np.stack(snip_label, axis=0)
+
+         outfile = h5py.File(filename, 'w')
+         dset_cls = outfile.create_dataset('/cls_label', self.cls_label.shape, maxshape=self.cls_label.shape, chunks=True, dtype=np.float32)
+         dset_cls[:, :] = self.cls_label[:, :]
+         dset_reg = outfile.create_dataset('/reg_label', self.reg_label.shape, maxshape=self.reg_label.shape, chunks=True, dtype=np.float32)
+         dset_reg[:, :] = self.reg_label[:, :]
+         dset_snip = outfile.create_dataset('/snip_label', self.snip_label.shape, maxshape=self.snip_label.shape, chunks=True, dtype=np.float32)
+         dset_snip[:, :] = self.snip_label[:, :]
+         outfile.close()
+
+         return
+
+     def __getitem__(self, index):
+         video_name, st, ed, data_idx = self.inputs[index]
+         if st >= 0:
+             feature = self._get_base_data(video_name, st, ed)
+         else:
+             feature = self._get_base_data(video_name, 0, ed)
+             padfunc2d = torch.nn.ConstantPad2d((0, 0, -st, 0), 0)
+             feature = padfunc2d(feature)
+
+         cls_label = torch.Tensor(self.cls_label[data_idx])
+         reg_label = torch.Tensor(self.reg_label[data_idx])
+         snip_label = torch.Tensor(self.snip_label[data_idx])
+
+         return feature, cls_label, reg_label, snip_label
+
+     def _get_base_data(self, video_name, st, ed):
+         feature_rgb = self.feature_rgb_file[video_name]
+         feature_rgb = feature_rgb[st:ed, :]
+
+         if self.feature_flow_file is not None:
+             feature_flow = self.feature_flow_file[video_name]
+             feature_flow = feature_flow[st:ed, :]
+             feature = np.append(feature_rgb, feature_flow, axis=1)
+         else:
+             feature = feature_rgb
+         feature = torch.from_numpy(np.array(feature))
+
+         return feature
+
+     def _get_train_label_with_class(self, video_name, st, ed):
+         duration = len(self.match_score[video_name])
+         st_padding = 0
+         ed_padding = 0
+         if st < 0:
+             st_padding = -st
+             st = 0
+         if ed > duration:
+             ed_padding = ed - duration
+             ed = duration
+
+         match_score = torch.Tensor(self.match_score[video_name][st:ed])
+         if st_padding > 0:
+             padfunc2d = torch.nn.ConstantPad2d((0, 0, st_padding, 0), 0)
+             match_score = padfunc2d(match_score)
+         if ed_padding > 0:
+             padfunc2d = torch.nn.ConstantPad2d((0, 0, 0, ed_padding), 0)
+             match_score = padfunc2d(match_score)
+         return match_score
+
+     def __len__(self):
+         return len(self.inputs)
+
+     def reset_sample(self):
+         self.inputs = self.inputs_all.copy()
+
+     def select_sample(self, idx):
+         inputs = [self.inputs_all[i] for i in idx]
+         self.inputs = inputs.copy()
+         return
+
+ class SuppressDataSet(data.Dataset):
+     def __init__(self, opt, subset="train"):
+         self.subset = subset
+         self.mode = opt["mode"]
+         self.data_file = h5py.File(opt["suppress_label_file"].format(self.subset + "_" + opt['setup']), 'r')
+         self.video_list = list(self.data_file.keys())
+         self.inputs = []
+         for index in range(0, len(self.video_list)):
+             video_name = self.video_list[index]
+             duration = self.data_file[video_name + '/input'].shape[0]
+             for i in range(0, duration):
+                 self.inputs.append([video_name, i])
+
+         print(f"{self.subset} subset seg numbers: {len(self.inputs)}")
+
+     def __getitem__(self, index):
+         video_name, idx = self.inputs[index]
+
+         input_seq = self.data_file[video_name + '/input'][idx]
+         label = self.data_file[video_name + '/label'][idx]
+
+         input_seq = torch.from_numpy(input_seq)
+         label = torch.from_numpy(label)
+
+         return input_seq, label
+
+     def __len__(self):
+         return len(self.inputs)
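
For reference, the annotation layout this dataset class reads, reconstructed from the accesses above (the video id and values are illustrative, not taken from the shipped JSON files):

    {
      "database": {
        "OP01-R01-PastaSalad": {
          "subset": "train",
          "duration": 1436.6,
          "annotations": [
            {"label": "Open", "segment": [10.2, 12.8]},
            {"label": "Take", "segment": [13.0, 14.1]}
          ]
        }
      }
    }

Segment times are in seconds; _getMatchScore converts them to feature-frame indices via second_to_frame.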
eval.py ADDED
@@ -0,0 +1,39 @@
+ # -*- coding: utf-8 -*-
+ import sys
+ sys.path.append('./Evaluation')
+ from eval_detection_gentime import ANETdetection
+ import matplotlib.pyplot as plt
+ import numpy as np
+
+ def run_evaluation_detection(opt, ground_truth_filename, prediction_filename,
+                              tiou_thresholds=np.linspace(0.5, 0.95, 10),
+                              subset='validation', verbose=True):
+
+     anet_detection = ANETdetection(opt, ground_truth_filename, prediction_filename,
+                                    subset=subset, tiou_thresholds=tiou_thresholds,
+                                    verbose=verbose, check_status=False)
+     anet_detection.evaluate()
+
+     ap = anet_detection.ap
+     mAP = anet_detection.mAP
+     tdiff = anet_detection.tdiff
+
+     return (mAP, ap, tdiff)
+
+ def evaluation_detection(opt, verbose=True):
+
+     mAP, AP, tdiff = run_evaluation_detection(
+         opt,
+         opt["video_anno"].format(opt["split"]),
+         opt["result_file"].format(opt['exp']),
+         tiou_thresholds=np.linspace(0.1, 0.50, 5),
+         subset=opt['inference_subset'], verbose=verbose)
+
+     if verbose:
+         print('mAP')
+         print(mAP)
+         print('AEDT')
+         print(tdiff)
+
+     return mAP
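
A minimal sketch of calling this entry point; only the option keys read above are shown, and the values (paths, split, experiment name, subset) are assumptions for illustration:

    from eval import evaluation_detection

    opt = {
        "num_of_class": 23,
        "video_anno": "data/egtea_annotations_split{}.json",  # formatted with opt["split"]
        "split": 1,
        "result_file": "output/result_proposal_{}.json",      # hypothetical; formatted with opt["exp"]
        "exp": "demo",
        "inference_subset": "test",
    }
    mAP = evaluation_detection(opt, verbose=True)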
feature_extractor.py ADDED
@@ -0,0 +1,29 @@
+ from models.i3d.extract_i3d import ExtractI3D
+ from utils.utils import build_cfg_path
+ from omegaconf import OmegaConf
+ import torch
+ from tqdm import tqdm
+ import os
+ import numpy as np
+
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+ if device == 'cuda':
+     print(torch.cuda.get_device_name(0))
+ # Select the feature type
+ feature_type = 'i3d'
+
+ # Load and patch the config
+ args = OmegaConf.load(build_cfg_path(feature_type))
+ args.step_size = 12
+ args.flow_type = 'raft'  # 'pwc'
+
+ # Load the model
+ extractor = ExtractI3D(args)
+
+ args.video_paths = os.listdir('./Videos')
+
+ # Extract features
+ for video_path in tqdm(args.video_paths):
+     print(f'Extracting for {video_path}')
+     feature_dict = extractor.extract('./Videos/' + video_path)
+     np.savez('./I3D/' + video_path[:-4] + '.npz', **feature_dict)
+     [(print(k), print(v.shape)) for k, v in feature_dict.items()]
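
The saved .npz files hold the extractor's feature dict; for the i3d feature type this is expected to include 'rgb' and 'flow' arrays, which lines up with the npz_i3d data_format branch that dataset.py uses to read features back (an assumption about the I3D extractor's output keys, not verified here).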
iou_utils.py ADDED
@@ -0,0 +1,65 @@
+ import numpy as np
+
+ def non_max_suppression(proposals, overlapThresh=0.3):
+     # if there are no intervals, return an empty list
+     if len(proposals) == 0:
+         return []
+
+     sorted_proposal = sorted(proposals, key=lambda proposal: proposal['score'], reverse=True)
+     idx = 0
+     total_proposal = len(sorted_proposal)
+     while idx < total_proposal:
+         proposal = sorted_proposal[idx]
+         st = proposal['segment'][0]
+         ed = proposal['segment'][1]
+         label = proposal['label']
+
+         delete_item = []
+         for j in range(idx + 1, total_proposal):
+             target_proposal = sorted_proposal[j]
+             target_st = target_proposal['segment'][0]
+             target_ed = target_proposal['segment'][1]
+             target_label = target_proposal['label']
+
+             if label == target_label:
+                 sst = np.minimum(st, target_st)
+                 led = np.maximum(ed, target_ed)
+                 lst = np.maximum(st, target_st)
+                 sed = np.minimum(ed, target_ed)
+
+                 iou = (sed - lst) / max(led - sst, 1)
+                 if iou > overlapThresh:
+                     delete_item.append(target_proposal)
+
+         for item in delete_item:
+             sorted_proposal.remove(item)
+         total_proposal = len(sorted_proposal)
+         idx += 1
+
+     return sorted_proposal
+
+ def check_overlap_proposal(proposal_list, new_proposal, overlapThresh=0.3):
+     for proposal in proposal_list:
+         st = proposal['segment'][0]
+         ed = proposal['segment'][1]
+         label = proposal['label']
+
+         new_st = new_proposal['segment'][0]
+         new_ed = new_proposal['segment'][1]
+         new_label = new_proposal['label']
+
+         if label == new_label:
+             sst = np.minimum(st, new_st)
+             led = np.maximum(ed, new_ed)
+             lst = np.maximum(st, new_st)
+             sed = np.minimum(ed, new_ed)
+
+             iou = (sed - lst) / max(led - sst, 1)
+             if iou > overlapThresh:
+                 return proposal
+
+     return None
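
A toy run of the class-wise NMS above (scores, labels, and segments invented for illustration):

    from iou_utils import non_max_suppression

    proposals = [
        {'segment': [0.0, 4.0], 'score': 0.9, 'label': 'Open'},
        {'segment': [0.5, 4.5], 'score': 0.8, 'label': 'Open'},   # IoU with first ~0.78 -> suppressed
        {'segment': [10.0, 12.0], 'score': 0.7, 'label': 'Take'},
    ]
    kept = non_max_suppression(proposals, overlapThresh=0.3)
    # kept retains the first and third proposals; suppression only applies within a label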
loss_func.py ADDED
@@ -0,0 +1,374 @@
+ import torch
+ import numpy as np
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import torch.distributed as dist
+ from functools import partial
+
+ class MultiCrossEntropyLoss(nn.Module):
+     def __init__(self, focal=False, weight=None, reduce=True):
+         super(MultiCrossEntropyLoss, self).__init__()
+         self.num_classes = 23
+         self.focal = focal
+         self.weight = weight
+         self.reduce = reduce
+         self.gamma_ = torch.zeros(self.num_classes).cuda() + 0.025
+         self.gamma_f = 0.05
+
+         self.register_buffer('pos_grad', torch.zeros(self.num_classes - 1).cuda())
+         self.register_buffer('neg_grad', torch.zeros(self.num_classes - 1).cuda())
+         self.register_buffer('pos_neg', torch.ones(self.num_classes - 1).cuda())
+
+     def forward(self, input, target):
+         # Normalize multi-label targets into a distribution per sample.
+         target_sum = torch.sum(target, dim=1)
+         target_div = torch.where(target_sum != 0, target_sum, torch.ones_like(target_sum)).unsqueeze(1)
+         target = target / target_div
+         logsoftmax = nn.LogSoftmax(dim=1).to(input.device)
+         # Per-class focusing factor, modulated by the accumulated pos/neg gradient ratio.
+         gamma = self.gamma_.clone()
+         gamma[:-1] = gamma[:-1] + self.gamma_f * (1 - self.pos_neg)
+
+         if not self.focal:
+             if self.weight is None:
+                 output = torch.sum(-target * logsoftmax(input), 1)
+             else:
+                 output = torch.sum(-target * logsoftmax(input) / self.weight, 1)
+         else:
+             softmax = nn.Softmax(dim=1).to(input.device)
+             p = softmax(input)
+             output = torch.sum(-target * (1 - p) ** gamma * logsoftmax(input), 1)
+
+         if self.reduce:
+             return torch.mean(output)
+         else:
+             return output
+
+     def map_func(self, x, s):
+         min_val = torch.min(x)
+         max_val = torch.max(x)
+         mu = torch.mean(x)
+         x = (x - min_val) / (max_val - min_val)
+         return 1 / (1 + torch.exp(-s * (x - mu)))
+
+     def collect_grad(self, target, grad):
+         grad = torch.abs(grad.reshape(-1, grad.shape[-1])).cuda()
+         target = target.reshape(-1, target.shape[-1]).cuda()
+         pos_grad = torch.sum(grad * target, dim=0)[:-1]
+         neg_grad = torch.sum(grad * (1 - target), dim=0)[:-1]
+         self.pos_grad += pos_grad
+         self.neg_grad += neg_grad
+         self.pos_neg = torch.clamp(self.pos_grad / (self.neg_grad + 1e-10), min=0, max=1)
+         self.pos_neg = self.map_func(self.pos_neg, 1)
+
+ def cls_loss_func(y, output, use_focal=False, weight=None, reduce=True):
+     input_size = y.size()
+     y = y.float().cuda()
+     if weight is not None:
+         weight = weight.cuda()
+     loss_func = MultiCrossEntropyLoss(focal=True, weight=weight, reduce=reduce)
+
+     y = y.reshape(-1, y.size(-1))
+     output = output.reshape(-1, output.size(-1))
+     loss = loss_func(output, y)
+
+     if not reduce:
+         loss = loss.reshape(input_size[:-1])
+
+     return loss
+
+ def cls_loss_func_(loss_func, y, output, use_focal=False, weight=None, reduce=True):
+     input_size = y.size()
+     y = y.float().cuda()
+     if weight is not None:
+         weight = weight.cuda()
+
+     y = y.reshape(-1, y.size(-1))
+     output = output.reshape(-1, output.size(-1))
+     loss = loss_func(output, y)
+
+     if not reduce:
+         loss = loss.reshape(input_size[:-1])
+
+     return loss
+
+ def regress_loss_func(y, output):
+     y = y.float().cuda()
+     y = y.reshape(-1, y.size(-1))
+     output = output.reshape(-1, output.size(-1))
+
+     # Background anchors carry a large negative length target (v2[-1] = -1e3 in dataset.py).
+     bgmask = y[:, 1] < -1e2
+
+     fg_logits = output[~bgmask]
+     bg_logits = output[bgmask]
+
+     fg_target = y[~bgmask]
+     bg_target = y[bgmask]
+
+     loss = nn.functional.l1_loss(fg_logits, fg_target)
+
+     # With no foreground anchors in the batch, l1_loss returns NaN.
+     if loss.isnan():
+         return torch.tensor([0.0], requires_grad=True).cuda()
+     return loss
+
+ def suppress_loss_func(y, output):
+     y = y.float().cuda()
+     y = y.reshape(-1, y.size(-1))
+     output = output.reshape(-1, output.size(-1))
+
+     loss = nn.functional.binary_cross_entropy(output, y)
+
+     return loss
126
+
127
+
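As a quick sanity check, here is a minimal sketch of calling these helpers on dummy tensors. Shapes follow the anchor heads in models.py (batch x anchors x classes for classification, batch x anchors x 2 for regression); a CUDA device is required because the helpers and MultiCrossEntropyLoss move tensors to the GPU, and all numbers are invented.

import torch
from loss_func import cls_loss_func, regress_loss_func

cls_label = torch.zeros(4, 6, 23)
cls_label[:, :, -1] = 1                        # mark every anchor as background
act_cls = torch.randn(4, 6, 23).cuda()         # raw classification logits

reg_label = torch.full((4, 6, 2), -1e3)        # the -1e3 sentinel marks background anchors
reg_label[0, 0] = torch.tensor([0.2, 0.1])     # one foreground anchor
act_reg = torch.randn(4, 6, 2).cuda()

print(cls_loss_func(cls_label, act_cls))       # scalar focal cross-entropy
print(regress_loss_func(reg_label, act_reg))   # L1 over the single foreground anchor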
models.py ADDED
@@ -0,0 +1,232 @@
+import numpy as np
+import torch
+import math
+from torch.autograd import Variable
+import torch.nn.functional as F
+import torch.nn as nn
+from torch.nn import init
+from torch.nn.functional import normalize
+
+
+class PositionalEncoding(nn.Module):
+    def __init__(self,
+                 emb_size: int,
+                 dropout: float = 0.1,
+                 maxlen: int = 750):
+        super(PositionalEncoding, self).__init__()
+        den = torch.exp(-torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
+        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
+        pos_embedding = torch.zeros((maxlen, emb_size))
+        pos_embedding[:, 0::2] = torch.sin(pos * den)
+        pos_embedding[:, 1::2] = torch.cos(pos * den)
+        pos_embedding = pos_embedding.unsqueeze(-2)
+        self.dropout = nn.Dropout(dropout)
+        self.register_buffer('pos_embedding', pos_embedding)
+
+    def forward(self, token_embedding: torch.Tensor):
+        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])
+
+
+class HistoryUnit(torch.nn.Module):
+    def __init__(self, opt):
+        super(HistoryUnit, self).__init__()
+        self.n_feature = opt["feat_dim"]
+        n_class = opt["num_of_class"]
+        n_embedding_dim = opt["hidden_dim"]
+        n_hist_dec_head = 4
+        n_hist_dec_layer = 5
+        n_hist_dec_head_2 = 4
+        n_hist_dec_layer_2 = 2
+        self.anchors = opt["anchors"]
+        self.history_tokens = 16
+        self.short_window_size = 16
+        self.anchors_stride = []
+        dropout = 0.3
+        self.best_loss = 1000000
+        self.best_map = 0
+
+        self.history_positional_encoding = PositionalEncoding(n_embedding_dim, dropout, maxlen=400)
+
+        self.history_encoder_block1 = nn.TransformerDecoder(
+            nn.TransformerDecoderLayer(d_model=n_embedding_dim,
+                                       nhead=n_hist_dec_head,
+                                       dropout=dropout,
+                                       activation='gelu'),
+            n_hist_dec_layer,
+            nn.LayerNorm(n_embedding_dim))
+
+        self.history_encoder_block2 = nn.TransformerDecoder(
+            nn.TransformerDecoderLayer(d_model=n_embedding_dim,
+                                       nhead=n_hist_dec_head_2,
+                                       dropout=dropout,
+                                       activation='gelu'),
+            n_hist_dec_layer_2,
+            nn.LayerNorm(n_embedding_dim))
+
+        self.snip_head = nn.Sequential(nn.Linear(n_embedding_dim, n_embedding_dim // 4), nn.ReLU())
+        self.snip_classifier = nn.Sequential(
+            nn.Linear(self.history_tokens * n_embedding_dim // 4, (self.history_tokens * n_embedding_dim // 4) // 4),
+            nn.ReLU(),
+            nn.Linear((self.history_tokens * n_embedding_dim // 4) // 4, n_class))
+
+        # Learnable query tokens that summarize the long-term history.
+        self.history_token = nn.Parameter(torch.zeros(self.history_tokens, 1, n_embedding_dim))
+
+        self.norm2 = nn.LayerNorm(n_embedding_dim)
+        self.dropout2 = nn.Dropout(0.1)
+
+    def forward(self, long_x, encoded_x):
+        ## History Encoder
+        hist_pe_x = self.history_positional_encoding(long_x)
+        history_token = self.history_token.expand(-1, hist_pe_x.shape[1], -1)
+        hist_encoded_x_1 = self.history_encoder_block1(history_token, hist_pe_x)
+        hist_encoded_x_2 = self.history_encoder_block2(hist_encoded_x_1, encoded_x)
+        hist_encoded_x_2 = hist_encoded_x_2 + self.dropout2(hist_encoded_x_1)
+        hist_encoded_x = self.norm2(hist_encoded_x_2)
+
+        ## Snippet Classification Head
+        snippet_feat = self.snip_head(hist_encoded_x_1)
+        snippet_feat = torch.flatten(snippet_feat.permute(1, 0, 2), start_dim=1)
+        snip_cls = self.snip_classifier(snippet_feat)
+
+        return hist_encoded_x, snip_cls
+
+
+class MYNET(torch.nn.Module):
+    def __init__(self, opt):
+        super(MYNET, self).__init__()
+        self.n_feature = opt["feat_dim"]
+        n_class = opt["num_of_class"]
+        n_embedding_dim = opt["hidden_dim"]
+        n_enc_layer = opt["enc_layer"]
+        n_enc_head = opt["enc_head"]
+        n_dec_layer = opt["dec_layer"]
+        n_dec_head = opt["dec_head"]
+        n_comb_dec_head = 4
+        n_comb_dec_layer = 5
+        n_seglen = opt["segment_size"]
+        self.anchors = opt["anchors"]
+        self.history_tokens = 16
+        self.short_window_size = 16
+        self.anchors_stride = []
+        dropout = 0.3
+        self.best_loss = 1000000
+        self.best_map = 0
+
+        self.feature_reduction_rgb = nn.Linear(self.n_feature // 2, n_embedding_dim // 2)
+        self.feature_reduction_flow = nn.Linear(self.n_feature // 2, n_embedding_dim // 2)
+
+        self.positional_encoding = PositionalEncoding(n_embedding_dim, dropout, maxlen=400)
+
+        self.encoder = nn.TransformerEncoder(
+            nn.TransformerEncoderLayer(d_model=n_embedding_dim,
+                                       nhead=n_enc_head,
+                                       dropout=dropout,
+                                       activation='gelu'),
+            n_enc_layer,
+            nn.LayerNorm(n_embedding_dim))
+
+        self.decoder = nn.TransformerDecoder(
+            nn.TransformerDecoderLayer(d_model=n_embedding_dim,
+                                       nhead=n_dec_head,
+                                       dropout=dropout,
+                                       activation='gelu'),
+            n_dec_layer,
+            nn.LayerNorm(n_embedding_dim))
+
+        self.history_unit = HistoryUnit(opt)
+
+        self.history_anchor_decoder_block1 = nn.TransformerDecoder(
+            nn.TransformerDecoderLayer(d_model=n_embedding_dim,
+                                       nhead=n_comb_dec_head,
+                                       dropout=dropout,
+                                       activation='gelu'),
+            n_comb_dec_layer,
+            nn.LayerNorm(n_embedding_dim))
+
+        self.classifier = nn.Sequential(nn.Linear(n_embedding_dim, n_embedding_dim), nn.ReLU(), nn.Linear(n_embedding_dim, n_class))
+        self.regressor = nn.Sequential(nn.Linear(n_embedding_dim, n_embedding_dim), nn.ReLU(), nn.Linear(n_embedding_dim, 2))
+
+        # One learnable query token per anchor scale.
+        self.decoder_token = nn.Parameter(torch.zeros(len(self.anchors), 1, n_embedding_dim))
+
+        self.norm1 = nn.LayerNorm(n_embedding_dim)
+        self.dropout1 = nn.Dropout(0.1)
+
+        self.relu = nn.ReLU(True)
+        self.softmaxd1 = nn.Softmax(dim=-1)
+
+    def forward(self, inputs):
+        # inputs: batch x seq_len x feat_dim; the first half of the feature
+        # dimension is RGB, the second half is optical flow.
+        base_x_rgb = self.feature_reduction_rgb(inputs[:, :, :self.n_feature // 2].float())
+        base_x_flow = self.feature_reduction_flow(inputs[:, :, self.n_feature // 2:].float())
+        base_x = torch.cat([base_x_rgb, base_x_flow], dim=-1)
+
+        base_x = base_x.permute([1, 0, 2])  # seq_len x batch x featsize
+
+        short_x = base_x[-self.short_window_size:]   # recent frames
+        long_x = base_x[:-self.short_window_size]    # long-term history
+
+        ## Anchor Feature Generator
+        pe_x = self.positional_encoding(short_x)
+        encoded_x = self.encoder(pe_x)
+        decoder_token = self.decoder_token.expand(-1, encoded_x.shape[1], -1)
+        decoded_x = self.decoder(decoder_token, encoded_x)
+
+        ## Future-Supervised History Module
+        hist_encoded_x, snip_cls = self.history_unit(long_x, encoded_x)
+
+        ## History-Driven Anchor Refinement
+        decoded_anchor_feat = self.history_anchor_decoder_block1(decoded_x, hist_encoded_x)
+        decoded_anchor_feat = decoded_anchor_feat + self.dropout1(decoded_x)
+        decoded_anchor_feat = self.norm1(decoded_anchor_feat)
+        decoded_anchor_feat = decoded_anchor_feat.permute([1, 0, 2])
+
+        ## Prediction Module
+        anc_cls = self.classifier(decoded_anchor_feat)
+        anc_reg = self.regressor(decoded_anchor_feat)
+
+        return anc_cls, anc_reg, snip_cls
+
+
+class SuppressNet(torch.nn.Module):
+    def __init__(self, opt):
+        super(SuppressNet, self).__init__()
+        n_class = opt["num_of_class"] - 1
+        n_seglen = opt["segment_size"]
+        n_embedding_dim = 2 * n_seglen
+        dropout = 0.3
+        self.best_loss = 1000000
+        self.best_map = 0
+
+        # FC layers over the per-class confidence history.
+        self.mlp1 = nn.Linear(n_seglen, n_embedding_dim)
+        self.mlp2 = nn.Linear(n_embedding_dim, 1)
+        self.norm = nn.InstanceNorm1d(n_class)
+        self.relu = nn.ReLU(True)
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, inputs):
+        # inputs: batch x seq_len x class
+        base_x = inputs.permute([0, 2, 1])
+        base_x = self.norm(base_x)
+        x = self.relu(self.mlp1(base_x))
+        x = self.sigmoid(self.mlp2(x))
+        x = x.squeeze(-1)
+
+        return x
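A minimal forward-pass sketch under the default hyper-parameters from opts_egtea.py; the option dict below is hand-built for illustration, and the snippet runs on CPU since models.py itself never calls .cuda().

import torch
from models import MYNET, SuppressNet

opt = {"feat_dim": 2048, "num_of_class": 23, "hidden_dim": 1024,
       "enc_layer": 3, "enc_head": 8, "dec_layer": 5, "dec_head": 4,
       "segment_size": 64, "anchors": [2, 4, 6, 8, 12, 16]}

model = MYNET(opt)
inputs = torch.randn(2, 64, 2048)                 # batch x seq_len x (RGB + flow) features
anc_cls, anc_reg, snip_cls = model(inputs)
print(anc_cls.shape, anc_reg.shape, snip_cls.shape)
# torch.Size([2, 6, 23]) torch.Size([2, 6, 2]) torch.Size([2, 23])

supnet = SuppressNet(opt)
conf = supnet(torch.rand(2, 64, 22))              # batch x segment_size x (num_of_class - 1)
print(conf.shape)                                 # torch.Size([2, 22])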
opts_egtea.py ADDED
@@ -0,0 +1,62 @@
+import argparse
+
+
+def parse_opt():
+    parser = argparse.ArgumentParser()
+    # Overall settings
+    parser.add_argument('--mode', type=str, default='train')
+    parser.add_argument('--video_name', type=str, default=None, help='Name of the single video to evaluate')
+    parser.add_argument('--video_path', type=str, default='', help='Path to the input video file for visualization')
+    parser.add_argument('--checkpoint_path', type=str, default='./checkpoint')
+    parser.add_argument('--segment_size', type=int, default=64)
+    parser.add_argument('--anchors', type=str, default='2,4,6,8,12,16')
+    parser.add_argument('--seed', default=7, type=int, help='random seed for reproducibility')
+
+    # Overall dataset settings
+    parser.add_argument('--num_of_class', type=int, default=23)
+    parser.add_argument('--data_format', type=str, default="npz_i3d")
+    parser.add_argument('--data_rescale', default=False, action='store_true')
+    parser.add_argument('--predefined_fps', default=None, type=float)
+    parser.add_argument('--rgb_only', default=False, action='store_true')
+    parser.add_argument('--video_anno', type=str, default="./data/egtea_annotations_split{}.json")
+    parser.add_argument('--video_feature_all_train', type=str, default="./data/I3D/")
+    parser.add_argument('--video_feature_all_test', type=str, default="./data/I3D/")
+    parser.add_argument('--setup', type=str, default="")
+    parser.add_argument('--exp', type=str, default="01")
+    parser.add_argument('--split', type=str, default="1")
+
+    # Network
+    parser.add_argument('--feat_dim', type=int, default=2048)
+    parser.add_argument('--hidden_dim', type=int, default=1024)
+    parser.add_argument('--out_dim', type=int, default=23)
+    parser.add_argument('--enc_layer', type=int, default=3)
+    parser.add_argument('--enc_head', type=int, default=8)
+    parser.add_argument('--dec_layer', type=int, default=5)
+    parser.add_argument('--dec_head', type=int, default=4)
+
+    # Training settings
+    parser.add_argument('--batch_size', type=int, default=128)
+    parser.add_argument('--lr', type=float, default=1e-4)
+    parser.add_argument('--weight_decay', type=float, default=1e-4)
+    parser.add_argument('--epoch', type=int, default=5)
+    parser.add_argument('--lr_step', type=int, default=3)
+
+    # Post-processing
+    parser.add_argument('--alpha', type=float, default=1)
+    parser.add_argument('--beta', type=float, default=1)
+    parser.add_argument('--gamma', type=float, default=0.2)
+    parser.add_argument('--pptype', type=str, default="net")
+    parser.add_argument('--pos_threshold', type=float, default=0.5)
+    parser.add_argument('--sup_threshold', type=float, default=0.1)
+    parser.add_argument('--threshold', type=float, default=0.1)
+    parser.add_argument('--inference_subset', type=str, default="test")
+    parser.add_argument('--soft_nms', type=float, default=0.3)
+    parser.add_argument('--video_len_file', type=str, default="./output/video_len_{}.json")
+    parser.add_argument('--proposal_label_file', type=str, default="./output/proposal_label_{}.h5")
+    parser.add_argument('--suppress_label_file', type=str, default="./output/suppress_label_{}.h5")
+    parser.add_argument('--suppress_result_file', type=str, default="./output/suppress_result{}.h5")
+    parser.add_argument('--frame_result_file', type=str, default="./output/frame_result{}.h5")
+    parser.add_argument('--result_file', type=str, default="./output/result_proposal{}.json")
+    parser.add_argument('--wterm', type=bool, default=False)
+
+    args = parser.parse_args()
+    return args
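A short sketch of how the training scripts consume these options, mirroring the __main__ block of supnet.py (parse_opt reads sys.argv, so run it from a script rather than a notebook):

import opts_egtea as opts

opt = vars(opts.parse_opt())
opt['anchors'] = [int(item) for item in opt['anchors'].split(',')]   # "2,4,..." -> [2, 4, ...]
print(opt['num_of_class'], opt['anchors'], opt['segment_size'])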
output/README.md ADDED
@@ -0,0 +1 @@
+If the dataset has changed, it is recommended to delete all files in this folder and rerun the main function from the start.
requirements.txt ADDED
@@ -0,0 +1,5 @@
+h5py
+ipdb
+scikit-learn
+matplotlib
+tensorboardX
main.py ADDED
The diff for this file is too large to render. See raw diff
 
supnet.py ADDED
@@ -0,0 +1,637 @@
+import os
+import json
+import torch
+import torchvision
+import torch.nn.parallel
+import torch.nn.functional as F
+import torch.optim as optim
+import numpy as np
+import opts_egtea as opts
+import time
+import h5py
+from iou_utils import *
+from eval import evaluation_detection
+from tensorboardX import SummaryWriter
+from dataset import VideoDataSet, SuppressDataSet
+from models import MYNET, SuppressNet
+from loss_func import cls_loss_func, regress_loss_func, suppress_loss_func
+from tqdm import tqdm
+
+
+def train_one_epoch(opt, model, train_dataset, optimizer):
+    train_loader = torch.utils.data.DataLoader(train_dataset,
+                                               batch_size=opt['batch_size'], shuffle=True,
+                                               num_workers=0, pin_memory=True, drop_last=False)
+    epoch_cost = 0
+
+    for n_iter, (input_data, label) in enumerate(tqdm(train_loader)):
+        suppress_conf = model(input_data.cuda())
+
+        loss = suppress_loss_func(label, suppress_conf)
+        epoch_cost += loss.detach().cpu().numpy()
+
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+    return n_iter, epoch_cost
+
+
+def eval_one_epoch(opt, model, test_dataset):
+    test_loader = torch.utils.data.DataLoader(test_dataset,
+                                              batch_size=opt['batch_size'], shuffle=False,
+                                              num_workers=0, pin_memory=True, drop_last=False)
+    epoch_cost = 0
+
+    for n_iter, (input_data, label) in enumerate(tqdm(test_loader)):
+        suppress_conf = model(input_data.cuda())
+
+        loss = suppress_loss_func(label, suppress_conf)
+        epoch_cost += loss.detach().cpu().numpy()
+
+    return n_iter, epoch_cost
+
+
+def train(opt):
+    writer = SummaryWriter()
+    model = SuppressNet(opt).cuda()
+
+    optimizer = optim.Adam(model.parameters(), lr=opt["lr"], weight_decay=opt["weight_decay"])
+    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=opt["lr_step"])
+
+    train_dataset = SuppressDataSet(opt, subset="train")
+    test_dataset = SuppressDataSet(opt, subset=opt['inference_subset'])
+
+    for n_epoch in range(opt['epoch']):
+        n_iter, epoch_cost = train_one_epoch(opt, model, train_dataset, optimizer)
+
+        writer.add_scalars('sup_data/cost', {'train': epoch_cost / (n_iter + 1)}, n_epoch)
+        print("training loss(epoch %d): %f, lr - %f" % (n_epoch,
+                                                        epoch_cost / (n_iter + 1),
+                                                        optimizer.param_groups[0]["lr"]))
+
+        scheduler.step()
+        model.eval()
+
+        n_iter, eval_cost = eval_one_epoch(opt, model, test_dataset)
+
+        writer.add_scalars('sup_data/eval', {'test': eval_cost / (n_iter + 1)}, n_epoch)
+        print("testing loss(epoch %d): %f" % (n_epoch, eval_cost / (n_iter + 1)))
+
+        state = {'epoch': n_epoch + 1,
+                 'state_dict': model.state_dict()}
+        torch.save(state, opt["checkpoint_path"] + "/checkpoint_suppress_" + str(n_epoch + 1) + ".pth.tar")
+        if eval_cost < model.best_loss:
+            model.best_loss = eval_cost
+            torch.save(state, opt["checkpoint_path"] + "/ckp_best_suppress.pth.tar")
+
+        model.train()
+
+    writer.close()
+    return
+
+
+def eval_frame(opt, model, dataset):
+    test_loader = torch.utils.data.DataLoader(dataset,
+                                              batch_size=opt['batch_size'], shuffle=False,
+                                              num_workers=0, pin_memory=True, drop_last=False)
+
+    labels_cls = {}
+    labels_reg = {}
+    output_cls = {}
+    output_reg = {}
+    for video_name in dataset.video_list:
+        labels_cls[video_name] = []
+        labels_reg[video_name] = []
+        output_cls[video_name] = []
+        output_reg[video_name] = []
+
+    start_time = time.time()
+    total_frames = 0
+    epoch_cost = 0
+    epoch_cost_cls = 0
+    epoch_cost_reg = 0
+
+    for n_iter, (input_data, cls_label, reg_label, _) in enumerate(tqdm(test_loader)):
+        act_cls, act_reg, _ = model(input_data.cuda())
+
+        cost_cls = cls_loss_func(cls_label, act_cls)
+        epoch_cost_cls += cost_cls.detach().cpu().numpy()
+
+        cost_reg = regress_loss_func(reg_label, act_reg)
+        epoch_cost_reg += cost_reg.detach().cpu().numpy()
+
+        cost = opt['alpha'] * cost_cls + opt['beta'] * cost_reg
+        epoch_cost += cost.detach().cpu().numpy()
+
+        act_cls = torch.softmax(act_cls, dim=-1)
+
+        total_frames += input_data.size(0)
+
+        # Regroup the per-batch outputs by video.
+        for b in range(0, input_data.size(0)):
+            video_name, st, ed, data_idx = dataset.inputs[n_iter * opt['batch_size'] + b]
+            output_cls[video_name] += [act_cls[b, :].detach().cpu().numpy()]
+            output_reg[video_name] += [act_reg[b, :].detach().cpu().numpy()]
+            labels_cls[video_name] += [cls_label[b, :].numpy()]
+            labels_reg[video_name] += [reg_label[b, :].numpy()]
+
+    end_time = time.time()
+    working_time = end_time - start_time
+
+    for video_name in dataset.video_list:
+        labels_cls[video_name] = np.stack(labels_cls[video_name], axis=0)
+        labels_reg[video_name] = np.stack(labels_reg[video_name], axis=0)
+        output_cls[video_name] = np.stack(output_cls[video_name], axis=0)
+        output_reg[video_name] = np.stack(output_reg[video_name], axis=0)
+
+    cls_loss = epoch_cost_cls / n_iter
+    reg_loss = epoch_cost_reg / n_iter
+    tot_loss = epoch_cost / n_iter
+
+    return cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames
+
+
+def test(opt):
+    model = SuppressNet(opt).cuda()
+    # NB: train() saves ckp_best_suppress.pth.tar without the exp prefix used here.
+    checkpoint = torch.load(opt["checkpoint_path"] + "/" + opt['exp'] + "ckp_best_suppress.pth.tar")
+    base_dict = checkpoint['state_dict']
+    model.load_state_dict(base_dict)
+    model.eval()
+
+    dataset = SuppressDataSet(opt, subset=opt['inference_subset'])
+
+    test_loader = torch.utils.data.DataLoader(dataset,
+                                              batch_size=opt['batch_size'], shuffle=False,
+                                              num_workers=0, pin_memory=True, drop_last=False)
+    labels = {}
+    output = {}
+    for video_name in dataset.video_list:
+        labels[video_name] = []
+        output[video_name] = []
+
+    for n_iter, (input_data, label) in enumerate(test_loader):
+        suppress_conf = model(input_data.cuda())
+
+        for b in range(0, input_data.size(0)):
+            video_name, idx = dataset.inputs[n_iter * opt['batch_size'] + b]
+            output[video_name] += [suppress_conf[b, :].detach().cpu().numpy()]
+            labels[video_name] += [label[b, :].numpy()]
+
+    for video_name in dataset.video_list:
+        labels[video_name] = np.stack(labels[video_name], axis=0)
+        output[video_name] = np.stack(output[video_name], axis=0)
+
+    outfile = h5py.File(opt['suppress_result_file'].format(opt['exp']), 'w')
+
+    for video_name in dataset.video_list:
+        o = output[video_name]
+        l = labels[video_name]
+
+        dset_pred = outfile.create_dataset(video_name + '/pred', o.shape, maxshape=o.shape, chunks=True, dtype=np.float32)
+        dset_pred[:, :] = o[:, :]
+        dset_label = outfile.create_dataset(video_name + '/label', l.shape, maxshape=l.shape, chunks=True, dtype=np.float32)
+        dset_label[:, :] = l[:, :]
+    outfile.close()
+    print('complete')
+
+
+def make_dataset(opt):
+    model = MYNET(opt).cuda()
+    checkpoint = torch.load(opt["checkpoint_path"] + "/" + opt['exp'] + "_ckp_best.pth.tar")
+    base_dict = checkpoint['state_dict']
+    model.load_state_dict(base_dict)
+    model.eval()
+
+    dataset = VideoDataSet(opt, subset=opt['inference_subset'])
+
+    _, _, _, output_cls, output_reg, labels_cls, labels_reg, _, _ = eval_frame(opt, model, dataset)
+
+    proposal_dict = []
+
+    outfile = h5py.File(opt['suppress_label_file'].format(opt['inference_subset'] + '_' + opt['setup']), 'w')
+
+    num_class = opt["num_of_class"] - 1
+    unit_size = opt['segment_size']
+    threshold = opt['threshold']
+    anchors = opt['anchors']
+
+    for video_name in dataset.video_list:
+        duration = dataset.video_len[video_name]
+
+        for idx in range(0, duration):
+            cls_anc = output_cls[video_name][idx]
+            reg_anc = output_reg[video_name][idx]
+
+            proposal_anc_dict = []
+            for anc_idx in range(0, len(anchors)):
+                # Classes (background excluded) whose confidence clears the threshold.
+                cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
+
+                if len(cls) == 0:
+                    continue
+
+                # Decode the regression output: end-point offset and log length ratio.
+                ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
+                length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
+                st = ed - length
+
+                for cidx in range(0, len(cls)):
+                    label = cls[cidx]
+                    tmp_dict = {}
+                    tmp_dict["segment"] = [st, ed]
+                    tmp_dict["score"] = cls_anc[anc_idx][label]
+                    tmp_dict["label"] = label
+                    tmp_dict["gentime"] = idx
+                    proposal_anc_dict.append(tmp_dict)
+
+            proposal_anc_dict = non_max_suppression(proposal_anc_dict, overlapThresh=opt['soft_nms'])
+            proposal_dict += proposal_anc_dict
+
+        nms_dict = non_max_suppression(proposal_dict, overlapThresh=opt['soft_nms'])
+
+        # input: per frame, the confidences of proposals generated over the previous
+        # unit_size frames; label: 1 around the generation time of the proposals
+        # that survive NMS.
+        input_table = np.zeros((duration, unit_size, num_class), dtype=np.float32)
+        label_table = np.zeros((duration, num_class), dtype=np.float32)
+
+        for proposal in proposal_dict:
+            idx = proposal["gentime"]
+            conf = proposal["score"]
+            cls = proposal["label"]
+            for i in range(0, unit_size):
+                if idx + i < duration:
+                    input_table[idx + i, unit_size - 1 - i, cls] = conf
+
+        for proposal in nms_dict:
+            idx = proposal["gentime"]
+            cls = proposal["label"]
+            label_table[idx:idx + 3, cls] = 1
+
+        dset_input_table = outfile.create_dataset(video_name + '/input', input_table.shape, maxshape=input_table.shape, chunks=True, dtype=np.float32)
+        dset_label_table = outfile.create_dataset(video_name + '/label', label_table.shape, maxshape=label_table.shape, chunks=True, dtype=np.float32)
+
+        dset_input_table[:] = input_table
+        dset_label_table[:] = label_table
+
+        proposal_dict = []
+
+    outfile.close()
+    print('complete')
+    return
+
+
+def main(opt):
+    if opt['mode'] == 'train':
+        train(opt)
+    if opt['mode'] == 'test':
+        test(opt)
+    if opt['mode'] == 'make':
+        make_dataset(opt)
+
+    return
+
+
+if __name__ == '__main__':
+    opt = opts.parse_opt()
+    opt = vars(opt)
+    if not os.path.exists(opt["checkpoint_path"]):
+        os.makedirs(opt["checkpoint_path"])
+    opt_file = open(opt["checkpoint_path"] + "/" + opt['exp'] + "_opts.json", "w")
+    json.dump(opt, opt_file)
+    opt_file.close()
+
+    if opt['seed'] >= 0:
+        seed = opt['seed']
+        torch.manual_seed(seed)
+        np.random.seed(seed)
+        #random.seed(seed)
+
+    opt['anchors'] = [int(item) for item in opt['anchors'].split(',')]
+
+    main(opt)
+    while opt['wterm']:
+        pass
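The regression decoding in make_dataset can be followed on paper; a toy example with invented numbers:

import numpy as np

idx, anchor = 100, 16                 # current frame and anchor length (frames)
reg = (0.25, 0.1)                     # hypothetical network output (offset, log length ratio)

ed = idx + anchor * reg[0]            # predicted end: 104.0
length = anchor * np.exp(reg[1])      # predicted length: ~17.68
st = ed - length                      # predicted start: ~86.32
print(st, ed)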