Spaces:

Darknsu
/

HATTAL

Runtime error

App Files Files Community

HATTAL / single result main.py

Darknsu

Upload 24 files

a51395e verified 10 months ago

raw

history blame contribute delete

22.5 kB

	import os
	import json
	import torch
	import torchvision
	import torch.nn.parallel
	import torch.nn.functional as F
	import torch.optim as optim
	import numpy as np
	import opts_egtea as opts

	import time
	import h5py
	from tqdm import tqdm
	from iou_utils import *
	from eval import evaluation_detection
	from tensorboardX import SummaryWriter
	from dataset import VideoDataSet
	from models import MYNET, SuppressNet
	from loss_func import cls_loss_func, cls_loss_func_, regress_loss_func
	from loss_func import MultiCrossEntropyLoss
	from functools import *

	def train_one_epoch(opt, model, train_dataset, optimizer, warmup=False):
	    """Run a single optimisation pass over ``train_dataset``.

	    Each batch yields three model outputs (anchor classification, anchor
	    regression and snippet classification). Their losses are combined as
	    ``alpha * cls + beta * reg + gamma * snip`` using the weights in ``opt``.

	    Args:
	        opt: Option dict; reads ``batch_size``, ``lr``, ``alpha``, ``beta``
	            and ``gamma``.
	        model: Network called on the CUDA-resident float input batch.
	        train_dataset: Dataset yielding
	            ``(input_data, cls_label, reg_label, snip_label)`` items.
	        optimizer: Optimiser stepped once per batch.
	        warmup: When true, every parameter group's learning rate is
	            overwritten with ``n_iter * opt['lr'] / total_iter`` per batch.

	    Returns:
	        ``(n_iter, epoch_cost, epoch_cost_cls, epoch_cost_reg,
	        epoch_cost_snip)`` where ``n_iter`` is the zero-based index of the
	        last batch and the costs are un-normalised sums over all batches.
	    """
	    loader = torch.utils.data.DataLoader(
	        train_dataset,
	        batch_size=opt['batch_size'],
	        shuffle=True,
	        num_workers=0,
	        pin_memory=True,
	        drop_last=False,
	    )

	    # Number of full batches per epoch; only used to scale the warm-up LR.
	    steps_per_epoch = len(train_dataset) // opt['batch_size']

	    # One loss object per classification head so each collects its own
	    # gradients through the hooks registered below.
	    cls_loss = MultiCrossEntropyLoss(focal=True)
	    snip_loss = MultiCrossEntropyLoss(focal=True)

	    total_sum = 0
	    cls_sum = 0
	    reg_sum = 0
	    snip_sum = 0

	    for n_iter, batch in enumerate(tqdm(loader)):
	        input_data, cls_label, reg_label, snip_label = batch

	        if warmup:
	            for group in optimizer.param_groups:
	                group['lr'] = n_iter * (opt['lr']) / steps_per_epoch

	        act_cls, act_reg, snip_cls = model(input_data.float().cuda())

	        # During backward, collect_grad is called with the labels bound here
	        # plus the gradient of the corresponding output tensor.
	        act_cls.register_hook(partial(cls_loss.collect_grad, cls_label))
	        snip_cls.register_hook(partial(snip_loss.collect_grad, snip_label))

	        cost_cls = cls_loss_func_(cls_loss, cls_label, act_cls)
	        cls_sum += cost_cls.detach().cpu().numpy()

	        cost_reg = regress_loss_func(reg_label, act_reg)
	        reg_sum += cost_reg.detach().cpu().numpy()

	        cost_snip = cls_loss_func_(snip_loss, snip_label, snip_cls)
	        snip_sum += cost_snip.detach().cpu().numpy()

	        cost = opt['alpha'] * cost_cls + opt['beta'] * cost_reg + opt['gamma'] * cost_snip
	        total_sum += cost.detach().cpu().numpy()

	        optimizer.zero_grad()
	        cost.backward()
	        optimizer.step()

	    return n_iter, total_sum, cls_sum, reg_sum, snip_sum

	def eval_one_epoch(opt, model, test_dataset):
	    """Evaluate ``model`` on ``test_dataset`` and score the NMS detections.

	    Frame-level outputs from ``eval_frame`` are turned into proposals by
	    ``eval_map_nms``, written as JSON to
	    ``opt["result_file"].format(opt['exp'])`` and then scored with
	    ``evaluation_detection``.

	    Args:
	        opt: Option dict; reads ``result_file`` and ``exp`` here and is
	            forwarded to the helpers.
	        model: Network to evaluate.
	        test_dataset: Dataset to evaluate on.

	    Returns:
	        ``(cls_loss, reg_loss, tot_loss, IoUmAP_5)``. Despite its name the
	        last value is the plain mean of *all* entries returned by
	        ``evaluation_detection``.
	    """
	    (cls_loss, reg_loss, tot_loss,
	     output_cls, output_reg, labels_cls, labels_reg,
	     _working_time, _total_frames) = eval_frame(opt, model, test_dataset)

	    result_dict = eval_map_nms(opt, test_dataset, output_cls, output_reg, labels_cls, labels_reg)
	    output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}

	    # The context manager closes the file even when serialisation fails.
	    # The file must be complete before evaluation_detection runs
	    # (presumably it reads this file back - confirm in eval.py).
	    with open(opt["result_file"].format(opt['exp']), "w") as outfile:
	        json.dump(output_dict, outfile, indent=2)

	    IoUmAP = evaluation_detection(opt, verbose=False)
	    IoUmAP_5 = sum(IoUmAP) / len(IoUmAP)

	    return cls_loss, reg_loss, tot_loss, IoUmAP_5

	def train(opt):
	    """Train ``MYNET`` for ``opt['epoch']`` epochs, evaluating after each one.

	    Parameters whose name contains ``"history_unit"`` are excluded from the
	    default group, and ``model.history_unit`` gets its own group with a
	    fixed learning rate of ``1e-6``. After every epoch a checkpoint is
	    written, and the best-scoring one is additionally saved under
	    ``<checkpoint_path>/<exp>_ckp_best.pth.tar``.

	    Args:
	        opt: Option dict; reads ``lr``, ``weight_decay``, ``lr_step``,
	            ``inference_subset``, ``epoch``, ``checkpoint_path`` and ``exp``.

	    Returns:
	        ``model.best_map`` after the final epoch.
	    """
	    writer = SummaryWriter()
	    model = MYNET(opt).cuda()

	    other_params = [
	        param for name, param in model.named_parameters()
	        if "history_unit" not in name
	    ]
	    param_groups = [
	        {'params': model.history_unit.parameters(), 'lr': 1e-6},
	        {'params': other_params},
	    ]
	    optimizer = optim.Adam(param_groups, lr=opt["lr"], weight_decay=opt["weight_decay"])
	    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=opt["lr_step"])

	    train_dataset = VideoDataSet(opt, subset="train")
	    test_dataset = VideoDataSet(opt, subset=opt['inference_subset'])

	    # Warm-up is switched off for every epoch; the flag is still forwarded.
	    warmup = False

	    for n_epoch in range(opt['epoch']):
	        last_iter, cost_all, cost_cls, cost_reg, cost_snip = train_one_epoch(
	            opt, model, train_dataset, optimizer, warmup)
	        # train_one_epoch reports the zero-based index of the last batch.
	        n_batches = last_iter + 1

	        writer.add_scalars('data/cost', {'train': cost_all / n_batches}, n_epoch)
	        print("training loss(epoch %d): %.03f, cls - %f, reg - %f, snip - %f, lr - %f" % (
	            n_epoch,
	            cost_all / n_batches,
	            cost_cls / n_batches,
	            cost_reg / n_batches,
	            cost_snip / n_batches,
	            optimizer.param_groups[-1]["lr"]))

	        scheduler.step()
	        model.eval()

	        cls_loss, reg_loss, tot_loss, IoUmAP_5 = eval_one_epoch(opt, model, test_dataset)

	        writer.add_scalars('data/mAP', {'test': IoUmAP_5}, n_epoch)
	        print("testing loss(epoch %d): %.03f, cls - %f, reg - %f, mAP Avg - %f" % (
	            n_epoch, tot_loss, cls_loss, reg_loss, IoUmAP_5))

	        ckpt_prefix = opt["checkpoint_path"] + "/" + opt["exp"]
	        state = {'epoch': n_epoch + 1, 'state_dict': model.state_dict()}
	        torch.save(state, ckpt_prefix + "_checkpoint_" + str(n_epoch + 1) + ".pth.tar")
	        if IoUmAP_5 > model.best_map:
	            model.best_map = IoUmAP_5
	            torch.save(state, ckpt_prefix + "_ckp_best.pth.tar")

	        model.train()

	    writer.close()
	    return model.best_map

	def eval_frame(opt, model, dataset):
	    """Run ``model`` over ``dataset`` and gather per-video outputs and labels.

	    Args:
	        opt: Option dict; reads ``batch_size``, ``alpha`` and ``beta``.
	        model: Network returning ``(act_cls, act_reg, <ignored>)`` for a
	            CUDA-resident float batch.
	        dataset: Dataset exposing ``video_list`` and ``inputs``. The first
	            field of ``inputs[i]`` is the video name of sample ``i``.

	    Returns:
	        ``(cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls,
	        labels_reg, working_time, total_frames)``. The three losses are
	        means over batches. The four dicts map each video name to an array
	        stacked along a new leading axis, with the classification outputs
	        already passed through softmax over the last dimension.
	        ``working_time`` is the wall-clock duration of the inference loop
	        in seconds, and ``total_frames`` is the number of samples seen.
	    """
	    test_loader = torch.utils.data.DataLoader(dataset,
	                                              batch_size=opt['batch_size'], shuffle=False,
	                                              num_workers=0, pin_memory=True, drop_last=False)

	    labels_cls = {name: [] for name in dataset.video_list}
	    labels_reg = {name: [] for name in dataset.video_list}
	    output_cls = {name: [] for name in dataset.video_list}
	    output_reg = {name: [] for name in dataset.video_list}

	    start_time = time.time()
	    total_frames = 0
	    n_batches = 0
	    epoch_cost = 0
	    epoch_cost_cls = 0
	    epoch_cost_reg = 0

	    # No backward pass happens here, so skip building the autograd graph.
	    with torch.no_grad():
	        for n_iter, (input_data, cls_label, reg_label, _) in enumerate(tqdm(test_loader)):
	            n_batches = n_iter + 1
	            act_cls, act_reg, _ = model(input_data.float().cuda())

	            cost_cls = cls_loss_func(cls_label, act_cls)
	            epoch_cost_cls += cost_cls.detach().cpu().numpy()

	            cost_reg = regress_loss_func(reg_label, act_reg)
	            epoch_cost_reg += cost_reg.detach().cpu().numpy()

	            cost = opt['alpha'] * cost_cls + opt['beta'] * cost_reg
	            epoch_cost += cost.detach().cpu().numpy()

	            act_cls = torch.softmax(act_cls, dim=-1)

	            batch_len = input_data.size(0)
	            total_frames += batch_len

	            # Move each tensor to NumPy once per batch instead of per sample.
	            cls_np = act_cls.detach().cpu().numpy()
	            reg_np = act_reg.detach().cpu().numpy()
	            cls_label_np = cls_label.numpy()
	            reg_label_np = reg_label.numpy()

	            for b in range(batch_len):
	                # The loader is unshuffled, so batch position maps directly
	                # onto the index into dataset.inputs.
	                video_name = dataset.inputs[n_iter * opt['batch_size'] + b][0]
	                output_cls[video_name].append(cls_np[b])
	                output_reg[video_name].append(reg_np[b])
	                labels_cls[video_name].append(cls_label_np[b])
	                labels_reg[video_name].append(reg_label_np[b])

	    working_time = time.time() - start_time

	    for video_name in dataset.video_list:
	        labels_cls[video_name] = np.stack(labels_cls[video_name], axis=0)
	        labels_reg[video_name] = np.stack(labels_reg[video_name], axis=0)
	        output_cls[video_name] = np.stack(output_cls[video_name], axis=0)
	        output_reg[video_name] = np.stack(output_reg[video_name], axis=0)

	    # Average over the number of batches. The previous code divided by the
	    # last batch *index*, which is one too small and zero for one batch.
	    denom = max(n_batches, 1)
	    cls_loss = epoch_cost_cls / denom
	    reg_loss = epoch_cost_reg / denom
	    tot_loss = epoch_cost / denom

	    return cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames

	def eval_map_nms(opt, dataset, output_cls, output_reg, labels_cls, labels_reg):
	    """Decode frame-level anchor outputs into proposals and apply NMS per video.

	    For every frame and anchor, each class whose score exceeds
	    ``opt['threshold']`` yields one proposal. The last class column is
	    never considered (presumably the background class - confirm against
	    the model head). All proposals of a video are passed through
	    ``non_max_suppression`` in one call.

	    Args:
	        opt: Option dict; reads ``threshold``, ``anchors`` and ``soft_nms``.
	        dataset: Exposes ``video_list``, ``video_len``, ``video_dict`` and
	            ``label_name``.
	        output_cls: Per-video class scores, indexed ``[frame][anchor][class]``.
	        output_reg: Per-video regression outputs, indexed
	            ``[frame][anchor][0 or 1]`` (end offset, log length scale).
	        labels_cls: Unused; kept for signature compatibility.
	        labels_reg: Unused; kept for signature compatibility.

	    Returns:
	        Dict mapping each video name to the result of
	        ``non_max_suppression`` on its proposals. Each proposal is a dict
	        with ``segment`` (start and end in seconds), ``score``, ``label``
	        and ``gentime``.
	    """
	    threshold = opt['threshold']
	    anchors = opt['anchors']
	    result_dict = {}

	    for video_name in dataset.video_list:
	        duration = dataset.video_len[video_name]
	        video_time = float(dataset.video_dict[video_name]["duration"])
	        # Kept as a x100 factor that is divided out again below, so the
	        # floating-point results match the previous implementation.
	        frame_to_time = 100.0 * video_time / duration

	        video_cls = output_cls[video_name]
	        video_reg = output_reg[video_name]
	        proposals = []

	        for idx in range(duration):
	            cls_anc = video_cls[idx]
	            reg_anc = video_reg[idx]

	            for anc_idx, anchor in enumerate(anchors):
	                labels = np.argwhere(cls_anc[anc_idx][:-1] > threshold).reshape(-1)
	                if len(labels) == 0:
	                    continue

	                # The regression head predicts the end offset (in anchor
	                # units) and the log of the length scale.
	                ed = idx + anchor * reg_anc[anc_idx][0]
	                length = anchor * np.exp(reg_anc[anc_idx][1])
	                st = ed - length

	                for label in labels:
	                    proposals.append({
	                        "segment": [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)],
	                        "score": float(cls_anc[anc_idx][label]),
	                        "label": dataset.label_name[label],
	                        "gentime": float(idx * frame_to_time / 100.0),
	                    })

	        result_dict[video_name] = non_max_suppression(proposals, overlapThresh=opt['soft_nms'])

	    return result_dict

	def eval_map_supnet(opt, dataset, output_cls, output_reg, labels_cls, labels_reg):
	    """Decode frame-level outputs into proposals, filtered by ``SuppressNet``.

	    A ``SuppressNet`` is loaded from
	    ``<checkpoint_path>/ckp_best_suppress.pth.tar``. For each frame the
	    anchor proposals are decoded and reduced with ``non_max_suppression``.
	    Their scores are pushed into a sliding queue of the last
	    ``opt['segment_size']`` frames, and the network's output for that queue
	    decides which classes may emit a proposal at this frame. An allowed
	    proposal is kept only if ``check_overlap_proposal`` returns ``None``
	    against those already kept for the video.

	    Args:
	        opt: Option dict; reads ``checkpoint_path``, ``num_of_class``,
	            ``segment_size``, ``threshold``, ``anchors``, ``soft_nms`` and
	            ``sup_threshold``.
	        dataset: Exposes ``video_list``, ``video_len``, ``video_dict`` and
	            ``label_name``.
	        output_cls: Per-video class scores, indexed ``[frame][anchor][class]``.
	        output_reg: Per-video regression outputs, indexed
	            ``[frame][anchor][0 or 1]``.
	        labels_cls: Unused.
	        labels_reg: Unused.

	    Returns:
	        Dict mapping each video name to its list of kept proposal dicts.
	    """
	    model = SuppressNet(opt).cuda()
	    checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best_suppress.pth.tar")
	    model.load_state_dict(checkpoint['state_dict'])
	    model.eval()

	    num_class = opt["num_of_class"]
	    unit_size = opt['segment_size']
	    threshold = opt['threshold']
	    anchors = opt['anchors']

	    result_dict = {}

	    for video_name in dataset.video_list:
	        duration = dataset.video_len[video_name]
	        video_time = float(dataset.video_dict[video_name]["duration"])
	        frame_to_time = 100.0 * video_time / duration

	        # Rolling window of per-class scores; the newest frame is the last row.
	        conf_queue = torch.zeros((unit_size, num_class - 1))
	        kept = []

	        for idx in range(duration):
	            cls_anc = output_cls[video_name][idx]
	            reg_anc = output_reg[video_name][idx]

	            frame_props = []
	            for anc_idx, anchor in enumerate(anchors):
	                hits = np.argwhere(cls_anc[anc_idx][:-1] > threshold).reshape(-1)
	                if len(hits) == 0:
	                    continue

	                ed = idx + anchor * reg_anc[anc_idx][0]
	                length = anchor * np.exp(reg_anc[anc_idx][1])
	                st = ed - length

	                for label in hits:
	                    frame_props.append({
	                        "segment": [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)],
	                        "score": float(cls_anc[anc_idx][label]),
	                        "label": dataset.label_name[label],
	                        "gentime": float(idx * frame_to_time / 100.0),
	                    })

	            frame_props = non_max_suppression(frame_props, overlapThresh=opt['soft_nms'])

	            # Shift the window one step and write this frame's scores last.
	            conf_queue[:-1, :] = conf_queue[1:, :].clone()
	            conf_queue[-1, :] = 0
	            for proposal in frame_props:
	                cls_idx = dataset.label_name.index(proposal['label'])
	                conf_queue[-1, cls_idx] = proposal["score"]

	            suppress_conf = model(conf_queue.unsqueeze(0).cuda())
	            suppress_conf = suppress_conf.squeeze(0).detach().cpu().numpy()

	            for class_id in range(num_class - 1):
	                if not suppress_conf[class_id] > opt['sup_threshold']:
	                    continue
	                for proposal in frame_props:
	                    if proposal['label'] != dataset.label_name[class_id]:
	                        continue
	                    if check_overlap_proposal(kept, proposal, overlapThresh=opt['soft_nms']) is None:
	                        kept.append(proposal)

	        result_dict[video_name] = kept

	    return result_dict

	def test_frame(opt, video_name=None):
	    """Evaluate the best checkpoint frame by frame and dump the results to HDF5.

	    For every video four datasets are written to
	    ``opt['frame_result_file'].format(opt['exp'])``:
	    ``<video>/pred_cls``, ``<video>/pred_reg``, ``<video>/label_cls`` and
	    ``<video>/label_reg``, all stored as float32.

	    Args:
	        opt: Option dict; reads ``checkpoint_path``, ``exp``,
	            ``inference_subset`` and ``frame_result_file``.
	        video_name: Forwarded to ``VideoDataSet``.

	    Returns:
	        ``(cls_loss, reg_loss, tot_loss)`` as reported by ``eval_frame``.
	    """
	    model = MYNET(opt).cuda()

	    ckpt_path = opt["checkpoint_path"] + "/ckp_best.pth.tar"
	    if not os.path.exists(ckpt_path):
	        # train() saves the best weights as "<exp>_ckp_best.pth.tar".
	        # Fall back to that name when the un-prefixed file is absent.
	        ckpt_path = opt["checkpoint_path"] + "/" + opt["exp"] + "_ckp_best.pth.tar"
	    checkpoint = torch.load(ckpt_path)
	    model.load_state_dict(checkpoint['state_dict'])
	    model.eval()

	    dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)

	    cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames = eval_frame(opt, model, dataset)

	    print("testing loss: %f, cls_loss: %f, reg_loss: %f" % (tot_loss, cls_loss, reg_loss))

	    # Open the output only after evaluation succeeded, so a failed run does
	    # not truncate earlier results. The context manager closes it on error.
	    with h5py.File(opt['frame_result_file'].format(opt['exp']), 'w') as outfile:
	        for name in dataset.video_list:
	            per_video = (
	                ('/pred_cls', output_cls[name]),
	                ('/pred_reg', output_reg[name]),
	                ('/label_cls', labels_cls[name]),
	                ('/label_reg', labels_reg[name]),
	            )
	            for suffix, values in per_video:
	                dset = outfile.create_dataset(name + suffix, values.shape, maxshape=values.shape,
	                                              chunks=True, dtype=np.float32)
	                dset[...] = values

	    print("working time : {}s, {}fps, {} frames".format(working_time, total_frames / working_time, total_frames))
	    return cls_loss, reg_loss, tot_loss

	def patch_attention(m):
	    """Patch ``m.forward`` in place so every call requests attention weights.

	    The replacement forwards all positional and keyword arguments
	    unchanged, but always sets ``need_weights=True`` and
	    ``average_attn_weights=False``, overriding any value the caller passed.
	    These are the keyword names used by ``torch.nn.MultiheadAttention``;
	    the module is presumably of that type - confirm at the call site.

	    Args:
	        m: Object whose ``forward`` attribute is replaced.

	    Returns:
	        None. ``m`` is modified in place.
	    """
	    forward_orig = m.forward

	    # Must be *args / **kwargs: with a plain "args, *kwargs" signature
	    # kwargs is a tuple and the item assignments below raise TypeError.
	    def wrap(*args, **kwargs):
	        kwargs["need_weights"] = True
	        kwargs["average_attn_weights"] = False
	        return forward_orig(*args, **kwargs)

	    m.forward = wrap

	class SaveOutput:
	    """Hook-style callable that stores element ``[1]`` of each module output.

	    The ``(module, module_in, module_out)`` call signature matches a PyTorch
	    forward hook. Captured values accumulate in ``outputs`` until
	    ``clear`` is called.
	    """

	    def __init__(self):
	        # Start with an empty capture list.
	        self.clear()

	    def __call__(self, module, module_in, module_out):
	        # Only the second element of the output is kept; the module and
	        # its input are ignored.
	        captured = module_out[1]
	        self.outputs.append(captured)

	    def clear(self):
	        # Rebind instead of emptying in place, so a list handed out
	        # earlier keeps its contents.
	        self.outputs = []

	def test(opt, video_name=None):
	    """Evaluate the best checkpoint of experiment ``opt['exp']``.

	    Loads ``<checkpoint_path>/<exp>_ckp_best.pth.tar``, runs frame-level
	    inference and post-processes the outputs with the method chosen by
	    ``opt["pptype"]``: ``"nms"`` uses ``eval_map_nms`` and ``"net"`` uses
	    ``eval_map_supnet``. The proposals are written as JSON to
	    ``opt["result_file"].format(opt['exp'])`` and then scored.

	    Args:
	        opt: Option dict; reads ``pptype``, ``checkpoint_path``, ``exp``,
	            ``inference_subset`` and ``result_file``.
	        video_name: Forwarded to ``VideoDataSet``.

	    Returns:
	        Whatever ``evaluation_detection(opt)`` returns.

	    Raises:
	        ValueError: If ``opt["pptype"]`` is neither ``"nms"`` nor ``"net"``.
	    """
	    # Validate up front. Previously an unknown value only surfaced as an
	    # unbound-variable error after the whole evaluation had already run.
	    pptype = opt["pptype"]
	    if pptype not in ("nms", "net"):
	        raise ValueError("unsupported pptype {!r}; expected 'nms' or 'net'".format(pptype))

	    model = MYNET(opt).cuda()
	    checkpoint = torch.load(opt["checkpoint_path"] + "/" + opt['exp'] + "_ckp_best.pth.tar")
	    model.load_state_dict(checkpoint['state_dict'])
	    model.eval()

	    dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)

	    (_cls_loss, _reg_loss, _tot_loss,
	     output_cls, output_reg, labels_cls, labels_reg,
	     _working_time, _total_frames) = eval_frame(opt, model, dataset)

	    postprocess = eval_map_nms if pptype == "nms" else eval_map_supnet
	    result_dict = postprocess(opt, dataset, output_cls, output_reg, labels_cls, labels_reg)

	    output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}
	    # The context manager closes the file even if serialisation fails.
	    with open(opt["result_file"].format(opt['exp']), "w") as outfile:
	        json.dump(output_dict, outfile, indent=2)

	    mAP = evaluation_detection(opt)
	    return mAP

	def _online_frame_proposals(cls_anc, reg_anc, idx, anchors, threshold, frame_to_time, label_names):
	    """Decode one frame's above-threshold anchor predictions into proposal dicts."""
	    proposals = []
	    for anc_idx, anchor in enumerate(anchors):
	        # The last class column is never considered.
	        labels = np.argwhere(cls_anc[anc_idx][:-1] > threshold).reshape(-1)
	        if len(labels) == 0:
	            continue

	        ed = idx + anchor * reg_anc[anc_idx][0]
	        length = anchor * np.exp(reg_anc[anc_idx][1])
	        st = ed - length

	        for label in labels:
	            proposals.append({
	                "segment": [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)],
	                "score": float(cls_anc[anc_idx][label]),
	                "label": label_names[label],
	                "gentime": float(idx * frame_to_time / 100.0),
	            })
	    return proposals

	def test_online(opt, video_name=None):
	    """Simulate online detection by feeding each video one frame at a time.

	    Two sliding queues of length ``opt['segment_size']`` are kept per video:
	    one of input features fed to ``MYNET`` and one of per-class proposal
	    scores fed to ``SuppressNet``. At each frame the anchor proposals are
	    decoded and reduced with ``non_max_suppression``. A proposal is kept
	    only if its class exceeds ``opt['sup_threshold']`` in the suppression
	    output and ``check_overlap_proposal`` returns ``None`` against the
	    proposals already kept. Results are written as JSON to
	    ``opt["result_file"].format(opt['exp'])`` and then scored.

	    Args:
	        opt: Option dict; reads ``checkpoint_path``, ``exp``,
	            ``inference_subset``, ``num_of_class``, ``segment_size``,
	            ``threshold``, ``anchors``, ``feat_dim``, ``soft_nms``,
	            ``sup_threshold`` and ``result_file``.
	        video_name: Forwarded to ``VideoDataSet``.

	    Returns:
	        Whatever ``evaluation_detection(opt)`` returns.
	    """
	    model = MYNET(opt).cuda()
	    ckpt_path = opt["checkpoint_path"] + "/ckp_best.pth.tar"
	    if not os.path.exists(ckpt_path):
	        # train() saves the best weights as "<exp>_ckp_best.pth.tar".
	        # Fall back to that name when the un-prefixed file is absent.
	        ckpt_path = opt["checkpoint_path"] + "/" + opt["exp"] + "_ckp_best.pth.tar"
	    checkpoint = torch.load(ckpt_path)
	    model.load_state_dict(checkpoint['state_dict'])
	    model.eval()

	    sup_model = SuppressNet(opt).cuda()
	    checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best_suppress.pth.tar")
	    sup_model.load_state_dict(checkpoint['state_dict'])
	    sup_model.eval()

	    dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)

	    result_dict = {}

	    num_class = opt["num_of_class"]
	    unit_size = opt['segment_size']
	    threshold = opt['threshold']
	    anchors = opt['anchors']

	    start_time = time.time()
	    total_frames = 0

	    # Pure inference: no gradients are needed for either network.
	    with torch.no_grad():
	        for name in dataset.video_list:
	            input_queue = torch.zeros((unit_size, opt['feat_dim']))
	            sup_queue = torch.zeros((unit_size, num_class - 1))
	            kept = []

	            duration = dataset.video_len[name]
	            video_time = float(dataset.video_dict[name]["duration"])
	            frame_to_time = 100.0 * video_time / duration

	            for idx in range(duration):
	                total_frames += 1

	                # Shift the feature window and append the newest frame.
	                input_queue[:-1, :] = input_queue[1:, :].clone()
	                input_queue[-1:, :] = dataset._get_base_data(name, idx, idx + 1)

	                act_cls, act_reg, _ = model(input_queue.unsqueeze(0).cuda())
	                act_cls = torch.softmax(act_cls, dim=-1)

	                cls_anc = act_cls.squeeze(0).detach().cpu().numpy()
	                reg_anc = act_reg.squeeze(0).detach().cpu().numpy()

	                frame_props = _online_frame_proposals(
	                    cls_anc, reg_anc, idx, anchors, threshold, frame_to_time, dataset.label_name)
	                frame_props = non_max_suppression(frame_props, overlapThresh=opt['soft_nms'])

	                # Shift the score window and write this frame's scores last.
	                sup_queue[:-1, :] = sup_queue[1:, :].clone()
	                sup_queue[-1, :] = 0
	                for proposal in frame_props:
	                    cls_idx = dataset.label_name.index(proposal['label'])
	                    sup_queue[-1, cls_idx] = proposal["score"]

	                suppress_conf = sup_model(sup_queue.unsqueeze(0).cuda())
	                suppress_conf = suppress_conf.squeeze(0).detach().cpu().numpy()

	                for class_id in range(num_class - 1):
	                    if suppress_conf[class_id] > opt['sup_threshold']:
	                        for proposal in frame_props:
	                            if proposal['label'] == dataset.label_name[class_id]:
	                                if check_overlap_proposal(kept, proposal, overlapThresh=opt['soft_nms']) is None:
	                                    kept.append(proposal)

	            result_dict[name] = kept

	    working_time = time.time() - start_time
	    print("working time : {}s, {}fps, {} frames".format(working_time, total_frames / working_time, total_frames))

	    output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}
	    # The context manager closes the file even if serialisation fails.
	    with open(opt["result_file"].format(opt['exp']), "w") as outfile:
	        json.dump(output_dict, outfile, indent=2)

	    mAP = evaluation_detection(opt)
	    return mAP

	def main(opt, video_name=None):
	    """Dispatch to the routine selected by ``opt['mode']``.

	    Args:
	        opt: Option dict; reads ``mode`` and, when ``video_name`` is falsy,
	            an optional ``video_name`` entry.
	        video_name: Video to restrict testing to. It is passed to the
	            ``test*`` modes only; ``train`` and ``eval`` ignore it.

	    Returns:
	        The selected routine's return value, or ``0`` when ``mode`` matches
	        none of ``train``, ``test``, ``test_frame``, ``test_online`` or
	        ``eval``.
	    """
	    # An explicit argument wins; otherwise fall back to the option dict.
	    if not video_name and 'video_name' in opt:
	        video_name = opt['video_name']

	    mode = opt['mode']
	    if mode == 'train':
	        return train(opt)
	    if mode == 'test':
	        return test(opt, video_name=video_name)
	    if mode == 'test_frame':
	        return test_frame(opt, video_name=video_name)
	    if mode == 'test_online':
	        return test_online(opt, video_name=video_name)
	    if mode == 'eval':
	        return evaluation_detection(opt)

	    # Unknown mode: nothing ran.
	    return 0

	if __name__ == '__main__':
	    # Parse the command-line options into a plain dict.
	    opt = vars(opts.parse_opt())

	    # exist_ok avoids the check-then-create race of a separate exists() test.
	    os.makedirs(opt["checkpoint_path"], exist_ok=True)

	    # Save the options as parsed, before 'anchors' is converted below.
	    with open(opt["checkpoint_path"] + "/" + opt["exp"] + "_opts.json", "w") as opt_file:
	        json.dump(opt, opt_file)

	    # A negative seed leaves the random number generators unseeded.
	    if opt['seed'] >= 0:
	        seed = opt['seed']
	        torch.manual_seed(seed)
	        np.random.seed(seed)

	    # 'anchors' arrives as a comma-separated string of integers.
	    opt['anchors'] = [int(item) for item in opt['anchors'].split(',')]

	    video_name = opt.get('video_name', None)
	    main(opt, video_name=video_name)

	    # Keep the process alive while 'wterm' is truthy. Sleep between checks
	    # instead of spinning, so the wait does not pin a CPU core.
	    while opt['wterm']:
	        time.sleep(1)