# Copyright (c) OpenMMLab. All rights reserved.
"""This file converts the output proposal file of proposal generator (BSN, BMN)
into the input proposal file of action classifier (Currently supports SSN and
P-GCN, not including TSN, I3D etc.)."""
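
# Example usage (the script path is a placeholder; the argument values below
# are simply this script's argparse defaults, adjust them to your layout):
#   python <path/to/this/script> \
#       --ann-file ../../../data/ActivityNet/anet_anno_val.json \
#       --activity-index-file ../../../data/ActivityNet/anet_activity_indexes_val.txt \
#       --proposal-file ../../../results.json \
#       --formatted-proposal-file ../../../anet_val_formatted_proposal.txt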
import argparse

import mmengine
import numpy as np

from mmaction.evaluation import pairwise_temporal_iou


def load_annotations(ann_file):
"""Load the annotation according to ann_file into video_infos."""
video_infos = []
anno_database = mmengine.load(ann_file)
for video_name in anno_database:
video_info = anno_database[video_name]
video_info['video_name'] = video_name
video_infos.append(video_info)
return video_infos


def import_ground_truth(video_infos, activity_index):
"""Read ground truth data from video_infos."""
ground_truth = {}
for video_info in video_infos:
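        # ActivityNet video names are stored with a 'v_' prefix; strip the
        # first two characters to recover the bare video ID.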
video_id = video_info['video_name'][2:]
this_video_ground_truths = []
for ann in video_info['annotations']:
t_start, t_end = ann['segment']
label = activity_index[ann['label']]
this_video_ground_truths.append([t_start, t_end, label])
ground_truth[video_id] = np.array(this_video_ground_truths)
return ground_truth


def import_proposals(result_dict):
"""Read predictions from result dict."""
proposals = {}
num_proposals = 0
for video_id in result_dict:
result = result_dict[video_id]
this_video_proposals = []
for proposal in result:
t_start, t_end = proposal['segment']
score = proposal['score']
this_video_proposals.append([t_start, t_end, score])
num_proposals += 1
proposals[video_id] = np.array(this_video_proposals)
return proposals, num_proposals


def dump_formatted_proposal(video_idx, video_id, num_frames, fps, gts,
proposals, tiou, t_overlap_self,
formatted_proposal_file):
"""dump the formatted proposal file, which is the input proposal file of
action classifier (e.g: SSN).
Args:
video_idx (int): Index of video.
video_id (str): ID of video.
num_frames (int): Total frames of the video.
fps (float): Fps of the video.
gts (np.ndarray[float]): t_start, t_end and label of groundtruths.
proposals (np.ndarray[float]): t_start, t_end and score of proposals.
        tiou (np.ndarray[float]): 2-dim array of temporal IoU ratios with
            shape (num_gts, num_proposals).
        t_overlap_self (np.ndarray[float]): 2-dim array of self-overlap
            (intersection / proposal length) ratios with the same shape.
        formatted_proposal_file (file object): Open file object of the
            formatted proposal file.
"""
formatted_proposal_file.write(
f'#{video_idx}\n{video_id}\n{num_frames}\n{fps}\n{gts.shape[0]}\n')
for gt in gts:
formatted_proposal_file.write(f'{int(gt[2])} {gt[0]} {gt[1]}\n')
formatted_proposal_file.write(f'{proposals.shape[0]}\n')
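    # tiou and t_overlap_self have shape (num_gts, num_proposals); reducing
    # over axis 0 picks, for each proposal, its best-matching ground truth.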
best_iou = np.amax(tiou, axis=0)
best_iou_index = np.argmax(tiou, axis=0)
best_overlap = np.amax(t_overlap_self, axis=0)
best_overlap_index = np.argmax(t_overlap_self, axis=0)
for i in range(proposals.shape[0]):
index_iou = best_iou_index[i]
index_overlap = best_overlap_index[i]
label_iou = gts[index_iou][2]
label_overlap = gts[index_overlap][2]
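        # When the IoU-based and overlap-based matches disagree, keep the
        # IoU-based label unless it is class 0, in which case fall back to
        # the overlap-based one.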
if label_iou != label_overlap:
label = label_iou if label_iou != 0 else label_overlap
else:
label = label_iou
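        # A proposal that overlaps no ground truth at all is written with
        # label 0 and zero IoU/overlap scores.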
if best_iou[i] == 0 and best_overlap[i] == 0:
formatted_proposal_file.write(
f'0 0 0 {proposals[i][0]} {proposals[i][1]}\n')
else:
formatted_proposal_file.write(
f'{int(label)} {best_iou[i]} {best_overlap[i]} '
f'{proposals[i][0]} {proposals[i][1]}\n')


def parse_args():
parser = argparse.ArgumentParser(description='convert proposal format')
parser.add_argument(
'--ann-file',
type=str,
default='../../../data/ActivityNet/anet_anno_val.json',
help='name of annotation file')
parser.add_argument(
'--activity-index-file',
type=str,
default='../../../data/ActivityNet/anet_activity_indexes_val.txt',
help='name of activity index file')
parser.add_argument(
'--proposal-file',
type=str,
default='../../../results.json',
        help='name of the proposal file, which is the '
        'output of the proposal generator (BMN)')
parser.add_argument(
'--formatted-proposal-file',
type=str,
default='../../../anet_val_formatted_proposal.txt',
        help='name of the formatted proposal file, which is the '
        'input of the action classifier (SSN)')
args = parser.parse_args()
return args


if __name__ == '__main__':
args = parse_args()
formatted_proposal_file = open(args.formatted_proposal_file, 'w')
# The activity index file is constructed according to
# 'https://github.com/activitynet/ActivityNet/blob/master/Evaluation/eval_classification.py'
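    # Each line of the index file is one class name; classes are indexed in
    # file order, starting from 0.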
    activity_index, class_idx = {}, 0
    with open(args.activity_index_file) as index_file:
        for line in index_file:
            activity_index[line.strip()] = class_idx
            class_idx += 1
video_infos = load_annotations(args.ann_file)
ground_truth = import_ground_truth(video_infos, activity_index)
proposal, num_proposals = import_proposals(
mmengine.load(args.proposal_file)['results'])
    for video_idx, video_info in enumerate(video_infos):
video_id = video_info['video_name'][2:]
num_frames = video_info['duration_frame']
fps = video_info['fps']
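        # Temporal IoU and self-overlap (intersection / proposal length)
        # between every ground truth / proposal pair of this video.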
tiou, t_overlap = pairwise_temporal_iou(
proposal[video_id][:, :2].astype(float),
ground_truth[video_id][:, :2].astype(float),
calculate_overlap_self=True)
dump_formatted_proposal(video_idx, video_id, num_frames, fps,
ground_truth[video_id], proposal[video_id],
tiou, t_overlap, formatted_proposal_file)
formatted_proposal_file.close()