|
|
|
|
|
"""This file converts the output proposal file of proposal generator (BSN, BMN)
|
|
|
into the input proposal file of action classifier (Currently supports SSN and
|
|
|
P-GCN, not including TSN, I3D etc.)."""
|
|
|
import argparse
|
|
|
|
|
|
import mmengine
|
|
|
import numpy as np
|
|
|
|
|
|
from mmaction.evaluation import pairwise_temporal_iou
|
|
|
|
|
|
|
|
|
def load_annotations(ann_file):
    """Load the annotation file and return a list of per-video info dicts.

    Each entry is the annotation dict for one video, augmented with a
    'video_name' key holding the video's key in the annotation database.
    """
    anno_database = mmengine.load(ann_file)
    video_infos = []
    for name, info in anno_database.items():
        info['video_name'] = name
        video_infos.append(info)
    return video_infos
|
|
|
|
|
|
|
|
|
def import_ground_truth(video_infos, activity_index):
    """Collect ground-truth segments from ``video_infos``.

    Args:
        video_infos (list[dict]): Per-video annotation dicts, each with a
            'video_name' and an 'annotations' list.
        activity_index (dict): Mapping from label name to class index.

    Returns:
        dict: video id (name with its first two chars stripped) ->
            np.ndarray of rows ``[t_start, t_end, label]``.
    """
    ground_truth = {}
    for info in video_infos:
        # Drop the 2-char prefix (e.g. 'v_') to get the bare video id.
        video_id = info['video_name'][2:]
        segments = [
            [ann['segment'][0], ann['segment'][1],
             activity_index[ann['label']]]
            for ann in info['annotations']
        ]
        ground_truth[video_id] = np.array(segments)
    return ground_truth
|
|
|
|
|
|
|
|
|
def import_proposals(result_dict):
    """Collect predicted proposals from a result dict.

    Args:
        result_dict (dict): video id -> list of proposal dicts, each with
            'segment' (pair of floats) and 'score'.

    Returns:
        tuple: (dict mapping video id -> np.ndarray of rows
            ``[t_start, t_end, score]``, total proposal count).
    """
    proposals = {}
    total = 0
    for video_id, result in result_dict.items():
        entries = [[p['segment'][0], p['segment'][1], p['score']]
                   for p in result]
        total += len(entries)
        proposals[video_id] = np.array(entries)
    return proposals, total
|
|
|
|
|
|
|
|
|
def dump_formatted_proposal(video_idx, video_id, num_frames, fps, gts,
                            proposals, tiou, t_overlap_self,
                            formatted_proposal_file):
    """Write one video's entry of the formatted proposal file, which is the
    input proposal file of the action classifier (e.g. SSN).

    Args:
        video_idx (int): Index of video.
        video_id (str): ID of video.
        num_frames (int): Total frames of the video.
        fps (float): Fps of the video.
        gts (np.ndarray[float]): t_start, t_end and label of groundtruths.
        proposals (np.ndarray[float]): t_start, t_end and score of proposals.
        tiou (np.ndarray[float]): 2-dim array with IoU ratio.
        t_overlap_self (np.ndarray[float]): 2-dim array with overlap_self
            (union / self_len) ratio.
        formatted_proposal_file (open file object): Open file object of
            formatted_proposal_file.
    """
    out = formatted_proposal_file.write

    # Header: video index marker, id, frame count, fps, then all gt segments.
    out(f'#{video_idx}\n{video_id}\n{num_frames}\n{fps}\n{gts.shape[0]}\n')
    for segment in gts:
        out(f'{int(segment[2])} {segment[0]} {segment[1]}\n')
    out(f'{proposals.shape[0]}\n')

    # Per proposal (columns), the best-matching gt (rows) under each metric.
    best_iou = tiou.max(axis=0)
    best_iou_idx = tiou.argmax(axis=0)
    best_overlap = t_overlap_self.max(axis=0)
    best_overlap_idx = t_overlap_self.argmax(axis=0)

    for idx, proposal in enumerate(proposals):
        label_by_iou = gts[best_iou_idx[idx]][2]
        label_by_overlap = gts[best_overlap_idx[idx]][2]
        # Prefer the IoU-matched label; fall back to the overlap-matched
        # label only when they disagree and the IoU label is background (0).
        if label_by_iou == label_by_overlap or label_by_iou != 0:
            label = label_by_iou
        else:
            label = label_by_overlap
        if best_iou[idx] == 0 and best_overlap[idx] == 0:
            # No gt matches this proposal at all: background line.
            out(f'0 0 0 {proposal[0]} {proposal[1]}\n')
        else:
            out(f'{int(label)} {best_iou[idx]} {best_overlap[idx]} '
                f'{proposal[0]} {proposal[1]}\n')
|
|
|
|
|
|
|
|
|
def parse_args():
    """Parse command-line arguments for the proposal-format conversion.

    Returns:
        argparse.Namespace: Parsed arguments with ``ann_file``,
            ``activity_index_file``, ``proposal_file`` and
            ``formatted_proposal_file`` attributes.
    """
    parser = argparse.ArgumentParser(description='convert proposal format')
    parser.add_argument(
        '--ann-file',
        type=str,
        default='../../../data/ActivityNet/anet_anno_val.json',
        help='name of annotation file')
    parser.add_argument(
        '--activity-index-file',
        type=str,
        default='../../../data/ActivityNet/anet_activity_indexes_val.txt',
        help='name of activity index file')
    parser.add_argument(
        '--proposal-file',
        type=str,
        default='../../../results.json',
        # NOTE: trailing space added so the two concatenated literals do not
        # fuse into 'theoutput' in --help.
        help='name of proposal file, which is the '
        'output of proposal generator (BMN)')
    parser.add_argument(
        '--formatted-proposal-file',
        type=str,
        default='../../../anet_val_formatted_proposal.txt',
        help='name of formatted proposal file, which is the '
        'input of action classifier (SSN)')
    return parser.parse_args()
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    args = parse_args()

    # Map each activity name (one per line in the index file) to a
    # contiguous class index, in file order.
    activity_index = {}
    with open(args.activity_index_file) as index_file:
        for class_idx, line in enumerate(index_file):
            activity_index[line.strip()] = class_idx

    video_infos = load_annotations(args.ann_file)
    ground_truth = import_ground_truth(video_infos, activity_index)
    proposal, num_proposals = import_proposals(
        mmengine.load(args.proposal_file)['results'])

    # Write one formatted entry per video; 'with' guarantees the output
    # file is flushed and closed even if a video raises.
    with open(args.formatted_proposal_file, 'w') as formatted_proposal_file:
        for video_idx, video_info in enumerate(video_infos):
            video_id = video_info['video_name'][2:]
            num_frames = video_info['duration_frame']
            fps = video_info['fps']
            tiou, t_overlap = pairwise_temporal_iou(
                proposal[video_id][:, :2].astype(float),
                ground_truth[video_id][:, :2].astype(float),
                calculate_overlap_self=True)

            dump_formatted_proposal(video_idx, video_id, num_frames, fps,
                                    ground_truth[video_id],
                                    proposal[video_id], tiou, t_overlap,
                                    formatted_proposal_file)
|
|
|
|