# Provenance (HuggingFace upload artifact): uploaded via huggingface_hub, commit 5f0437a (verified).
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Copyright (c) 2023 Image Processing Research Group of University Federico II of Naples ('GRIP-UNINA').
#
# All rights reserved.
# This work should only be used for nonprofit purposes.
#
# By downloading and/or using any of these files, you implicitly agree to all the
# terms of the license, as specified in the document LICENSE.txt
# (included in this package) and online at
# http://www.grip.unina.it/download/LICENSE_OPEN.txt
"""
Created in September 2022
@author: fabrizio.guillaro
"""
import sys, os

# Make the repository root (parent of this script's directory) importable,
# so that `lib.*` and `dataset.*` below resolve when run as a script.
path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..')
if path not in sys.path:
    sys.path.insert(0, path)

import argparse
import logging
import time
import timeit
import gc

import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.optim

# NOTE(review): anomaly detection adds substantial overhead to every backward
# pass; it is usually enabled only while debugging NaN gradients — confirm it
# is intended for production training runs.
torch.autograd.set_detect_anomaly(True)

from tensorboardX import SummaryWriter

# Project-local modules (resolved via the sys.path insertion above).
from lib.config import config, update_config
from lib.core.function import train, validate
from lib.utils import get_model, get_optimizer
from lib.utils import create_logger, FullModel, adjust_learning_rate
from dataset.data_core import myDataset

import albumentations
def _save_checkpoint(path, epoch, best_value, best_key, model, optimizer):
    """Serialize the full training state to `path`.

    Stored keys: 'epoch' (next epoch to run), 'best_value'/'best_key'
    (best validation metric so far), 'state_dict' (raw network weights,
    unwrapped from FullModel/DataParallel), 'optimizer'.
    """
    torch.save({
        'epoch': epoch + 1,
        'best_value': best_value,
        'best_key': best_key,
        'state_dict': model.model.module.state_dict(),
        'optimizer': optimizer.state_dict(),
    }, path)


def main():
    """Train TruFor: build data/model/optimizer from config, then run the
    epoch loop with per-epoch validation and best-checkpoint tracking.

    Command line:
        -exp/--experiment  experiment name used by create_logger
        -g/--gpu           physical GPU id(s); remapped to logical 0..N-1
                           through CUDA_VISIBLE_DEVICES
        opts               remaining key/value overrides for update_config
    """
    parser = argparse.ArgumentParser(description='Train TruFor')
    parser.add_argument('-exp', '--experiment', type=str)
    parser.add_argument('-g', '--gpu', type=int, default=[0], nargs="+", help='device(s)')
    parser.add_argument('opts', help='other options', default=None, nargs=argparse.REMAINDER)
    args = parser.parse_args()

    # Restrict visible devices, then address them by logical index 0..N-1.
    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(str(x) for x in args.gpu)
    args.gpu = range(len(args.gpu))
    update_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(config, f'{args.experiment}', 'train')
    logger.info(config)
    logger.info('\n')

    # cudnn setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED

    gpus = list(config.GPUS)

    writer_dict = {
        'writer': SummaryWriter(tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # Optional albumentations pipelines, deserialized from YAML files.
    aug_train = (albumentations.load(config.TRAIN.AUG, data_format='yaml')
                 if config.TRAIN.AUG is not None else None)
    aug_valid = (albumentations.load(config.VALID.AUG, data_format='yaml')
                 if config.VALID.AUG is not None else None)
    logger.info(f'Train augmentation: {config.TRAIN.AUG} {aug_train}')
    logger.info(f'Validation augmentation: {config.VALID.AUG} {aug_valid}')

    # IMAGE_SIZE is (width, height); myDataset's crop_size wants (height, width).
    crop_size = (config.TRAIN.IMAGE_SIZE[1], config.TRAIN.IMAGE_SIZE[0])
    train_dataset = myDataset(config, crop_size=crop_size, grid_crop=False, mode='train', aug=aug_train)
    valid_dataset = myDataset(config, crop_size=None, grid_crop=False, mode="valid", aug=aug_valid,
                              max_dim=config.VALID.MAX_SIZE)

    trainloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE_PER_GPU * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS)

    validloader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=1,    # 1 to allow arbitrary input sizes
        shuffle=False,   # must be False to get accurate filename
        num_workers=config.WORKERS)

    # model
    model = get_model(config)
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()
    model = FullModel(model, config)

    # optimizer
    optimizer = get_optimizer(model, config)

    # Fix: idiomatic len() instead of __len__(); value truncates as before.
    epoch_iters = np.int32(len(train_dataset) / config.TRAIN.BATCH_SIZE_PER_GPU / len(gpus))

    best_key = config.VALID.BEST_KEY
    # Loss-type keys improve downward; every other metric improves upward.
    minimize = 'loss' in best_key
    best_value = np.inf if minimize else 0
    logger.info(f'best valid key: {best_key}')

    last_epoch = 0
    # Fix: explicit empty/None membership test instead of `not x == None`.
    if config.TRAIN.PRETRAINING not in ('', None):
        model_state_file = config.TRAIN.PRETRAINING
        assert os.path.isfile(model_state_file)
        checkpoint = torch.load(model_state_file, map_location=lambda storage, loc: storage)
        state_dict = checkpoint['state_dict']
        try:
            model.model.module.load_state_dict(state_dict, strict=False)
        # Fix: narrow the bare except — load_state_dict raises RuntimeError on
        # tensor shape mismatches; retry without the incompatible detection head.
        except RuntimeError:
            state_dict = {k: state_dict[k] for k in state_dict if not k.startswith('detection')}
            model.model.module.load_state_dict(state_dict, strict=False)
        del checkpoint
        del state_dict
        logger.info("=> loaded pretraining ({})".format(model_state_file))

    if config.TRAIN.RESUME:
        model_state_file = os.path.join(final_output_dir, 'checkpoint.pth.tar')
        if os.path.isfile(model_state_file):
            checkpoint = torch.load(model_state_file, map_location=lambda storage, loc: storage)
            best_value = checkpoint['best_value']
            assert checkpoint['best_key'] == best_key
            last_epoch = checkpoint['epoch']
            model.model.module.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            logger.info("=> loaded checkpoint (epoch {})".format(checkpoint['epoch']))
            writer_dict['train_global_steps'] = last_epoch
        else:
            logger.info("No previous checkpoint.")

    end_epoch = config.TRAIN.END_EPOCH + config.TRAIN.EXTRA_EPOCH
    num_iters = config.TRAIN.END_EPOCH * epoch_iters
    start_epoch = last_epoch
    if config.VALID.FIRST_VALID:
        # Start one epoch early so the first loop iteration only validates.
        start_epoch = start_epoch - 1

    for epoch in range(start_epoch, end_epoch):
        # train (skipped on the FIRST_VALID validation-only iteration)
        if epoch >= last_epoch:
            train_dataset.shuffle()  # for class-balanced sampling
            print(f'TRAINING epoch {epoch}:')
            train(epoch, config.TRAIN.END_EPOCH,
                  epoch_iters, config.TRAIN.LR, num_iters,
                  trainloader, optimizer, model, writer_dict,
                  adjust_learning_rate=adjust_learning_rate)
            torch.cuda.empty_cache()
            gc.collect()
            time.sleep(1.0)

            ckpt_path = os.path.join(final_output_dir, 'checkpoint.pth.tar')
            logger.info('=> saving checkpoint to {}'.format(ckpt_path))
            _save_checkpoint(ckpt_path, epoch, best_value, best_key, model, optimizer)

        # valid
        print(f'VALIDATION epoch {epoch}:')
        writer_dict['valid_global_steps'] = epoch
        value_valid, IoU_array, confusion_matrix = \
            validate(config, validloader, model, writer_dict, "valid")
        torch.cuda.empty_cache()
        gc.collect()
        time.sleep(3.0)

        # Unified best-model check (was two duplicated save branches):
        # smaller is better for loss keys, larger is better otherwise.
        if minimize:
            improved = value_valid[best_key] < best_value
        else:
            improved = value_valid[best_key] > best_value
        if improved:
            best_value = value_valid[best_key]
            _save_checkpoint(os.path.join(final_output_dir, 'best.pth.tar'),
                             epoch, best_value, best_key, model, optimizer)
            logger.info("best.pth.tar updated.")

        msg = '(Valid) Loss: {:.3f}, Best_{:s}: {: 4.4f}'.format(
            value_valid['loss'], best_key, best_value)
        # Fix: report through the experiment logger, not the root `logging` module.
        logger.info(msg)
        logger.info(IoU_array)
        logger.info("confusion_matrix:")
        logger.info(confusion_matrix)
# Script entry point: run training only when executed directly, not on import.
if __name__ == '__main__':
    main()