|
|
|
|
|
"""This file converts the output proposal file of proposal generator (BSN, BMN)
|
|
|
into the input proposal file of action classifier (Currently supports SSN and
|
|
|
P-GCN, not including TSN, I3D etc.)."""
|
|
|
import argparse
|
|
|
|
|
|
import mmengine
|
|
|
import numpy as np
|
|
|
|
|
|
from mmaction.evaluation import pairwise_temporal_iou
|
|
|
|
|
|
|
|
|
def load_annotations(ann_file):
    """Load the annotation file and return a list of per-video info dicts.

    Each entry is the annotation dict for one video, augmented with a
    'video_name' key holding the video's key in the annotation database.
    """
    anno_database = mmengine.load(ann_file)
    video_infos = []
    for name, info in anno_database.items():
        info['video_name'] = name
        video_infos.append(info)
    return video_infos
|
|
|
|
|
|
|
|
|
def import_ground_truth(video_infos, activity_index):
    """Collect ground-truth segments from ``video_infos``.

    Args:
        video_infos (list[dict]): Per-video annotation dicts, each with a
            'video_name' and an 'annotations' list.
        activity_index (dict): Mapping from label name to class index.

    Returns:
        dict: video id (name with its first two chars stripped) ->
            np.ndarray of rows ``[t_start, t_end, label]``.
    """
    ground_truth = {}
    for info in video_infos:
        # Drop the 2-char prefix (e.g. 'v_') to get the bare video id.
        video_id = info['video_name'][2:]
        segments = [
            [ann['segment'][0], ann['segment'][1],
             activity_index[ann['label']]]
            for ann in info['annotations']
        ]
        ground_truth[video_id] = np.array(segments)
    return ground_truth
|
|
|
|
|
|
|
|
|
def import_proposals(result_dict):
    """Collect predicted proposals from a result dict.

    Args:
        result_dict (dict): video id -> list of proposal dicts, each with
            'segment' (pair of floats) and 'score'.

    Returns:
        tuple: (dict mapping video id -> np.ndarray of rows
            ``[t_start, t_end, score]``, total proposal count).
    """
    proposals = {}
    total = 0
    for video_id, result in result_dict.items():
        entries = [[p['segment'][0], p['segment'][1], p['score']]
                   for p in result]
        total += len(entries)
        proposals[video_id] = np.array(entries)
    return proposals, total
|
|
|
|
|
|
|
|
|
def dump_formatted_proposal(video_idx, video_id, num_frames, fps, gts,
                            proposals, tiou, t_overlap_self,
                            formatted_proposal_file):
    """Write one video's entry of the formatted proposal file, which is the
    input proposal file of the action classifier (e.g. SSN).

    Args:
        video_idx (int): Index of video.
        video_id (str): ID of video.
        num_frames (int): Total frames of the video.
        fps (float): Fps of the video.
        gts (np.ndarray[float]): t_start, t_end and label of groundtruths.
        proposals (np.ndarray[float]): t_start, t_end and score of proposals.
        tiou (np.ndarray[float]): 2-dim array with IoU ratio.
        t_overlap_self (np.ndarray[float]): 2-dim array with overlap_self
            (union / self_len) ratio.
        formatted_proposal_file (open file object): Open file object of
            formatted_proposal_file.
    """
    out = formatted_proposal_file.write

    # Header: video index marker, id, frame count, fps, then all gt segments.
    out(f'#{video_idx}\n{video_id}\n{num_frames}\n{fps}\n{gts.shape[0]}\n')
    for segment in gts:
        out(f'{int(segment[2])} {segment[0]} {segment[1]}\n')
    out(f'{proposals.shape[0]}\n')

    # Per proposal (columns), the best-matching gt (rows) under each metric.
    best_iou = tiou.max(axis=0)
    best_iou_idx = tiou.argmax(axis=0)
    best_overlap = t_overlap_self.max(axis=0)
    best_overlap_idx = t_overlap_self.argmax(axis=0)

    for idx, proposal in enumerate(proposals):
        label_by_iou = gts[best_iou_idx[idx]][2]
        label_by_overlap = gts[best_overlap_idx[idx]][2]
        # Prefer the IoU-matched label; fall back to the overlap-matched
        # label only when they disagree and the IoU label is background (0).
        if label_by_iou == label_by_overlap or label_by_iou != 0:
            label = label_by_iou
        else:
            label = label_by_overlap
        if best_iou[idx] == 0 and best_overlap[idx] == 0:
            # No gt matches this proposal at all: background line.
            out(f'0 0 0 {proposal[0]} {proposal[1]}\n')
        else:
            out(f'{int(label)} {best_iou[idx]} {best_overlap[idx]} '
                f'{proposal[0]} {proposal[1]}\n')
|
|
|
|
|
|
|
|
|
def parse_args():
    """Parse command-line arguments for the proposal-format conversion.

    Returns:
        argparse.Namespace: Parsed arguments with ``ann_file``,
            ``activity_index_file``, ``proposal_file`` and
            ``formatted_proposal_file`` attributes.
    """
    parser = argparse.ArgumentParser(description='convert proposal format')
    parser.add_argument(
        '--ann-file',
        type=str,
        default='../../../data/ActivityNet/anet_anno_val.json',
        help='name of annotation file')
    parser.add_argument(
        '--activity-index-file',
        type=str,
        default='../../../data/ActivityNet/anet_activity_indexes_val.txt',
        help='name of activity index file')
    parser.add_argument(
        '--proposal-file',
        type=str,
        default='../../../results.json',
        # NOTE: trailing space added so the two concatenated literals do not
        # fuse into 'theoutput' in --help.
        help='name of proposal file, which is the '
        'output of proposal generator (BMN)')
    parser.add_argument(
        '--formatted-proposal-file',
        type=str,
        default='../../../anet_val_formatted_proposal.txt',
        help='name of formatted proposal file, which is the '
        'input of action classifier (SSN)')
    return parser.parse_args()
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    args = parse_args()

    # Map each activity name (one per line in the index file) to a
    # contiguous class index, in file order.
    activity_index = {}
    with open(args.activity_index_file) as index_file:
        for class_idx, line in enumerate(index_file):
            activity_index[line.strip()] = class_idx

    video_infos = load_annotations(args.ann_file)
    ground_truth = import_ground_truth(video_infos, activity_index)
    proposal, num_proposals = import_proposals(
        mmengine.load(args.proposal_file)['results'])

    # Write one formatted entry per video; 'with' guarantees the output
    # file is flushed and closed even if a video raises.
    with open(args.formatted_proposal_file, 'w') as formatted_proposal_file:
        for video_idx, video_info in enumerate(video_infos):
            video_id = video_info['video_name'][2:]
            num_frames = video_info['duration_frame']
            fps = video_info['fps']
            tiou, t_overlap = pairwise_temporal_iou(
                proposal[video_id][:, :2].astype(float),
                ground_truth[video_id][:, :2].astype(float),
                calculate_overlap_self=True)

            dump_formatted_proposal(video_idx, video_id, num_frames, fps,
                                    ground_truth[video_id],
                                    proposal[video_id], tiou, t_overlap,
                                    formatted_proposal_file)
|
|
|
|