shawn

0fbed26 about 2 years ago

10.7 kB

	# Copyright (c) OpenMMLab. All rights reserved.
	import datetime
	import os
	import os.path as osp
	from collections import OrderedDict

	import torch
	import torch.distributed as dist

	import annotator.uniformer.mmcv as mmcv
	from annotator.uniformer.mmcv.fileio.file_client import FileClient
	from annotator.uniformer.mmcv.utils import is_tuple_of, scandir
	from ..hook import HOOKS
	from .base import LoggerHook


	@HOOKS.register_module()
	class TextLoggerHook(LoggerHook):
	"""Logger hook in text.

	In this logger hook, the information will be printed on terminal and
	saved in json file.

	Args:
	by_epoch (bool, optional): Whether EpochBasedRunner is used.
	Default: True.
	interval (int, optional): Logging interval (every k iterations).
	Default: 10.
	ignore_last (bool, optional): Ignore the log of last iterations in each
	epoch if less than :attr:`interval`. Default: True.
	reset_flag (bool, optional): Whether to clear the output buffer after
	logging. Default: False.
	interval_exp_name (int, optional): Logging interval for experiment
	name. This feature is to help users conveniently get the experiment
	information from screen or log file. Default: 1000.
	out_dir (str, optional): Logs are saved in ``runner.work_dir`` default.
	If ``out_dir`` is specified, logs will be copied to a new directory
	which is the concatenation of ``out_dir`` and the last level
	directory of ``runner.work_dir``. Default: None.
	`New in version 1.3.16.`
	out_suffix (str or tuple[str], optional): Those filenames ending with
	``out_suffix`` will be copied to ``out_dir``.
	Default: ('.log.json', '.log', '.py').
	`New in version 1.3.16.`
	keep_local (bool, optional): Whether to keep local log when
	:attr:`out_dir` is specified. If False, the local log will be
	removed. Default: True.
	`New in version 1.3.16.`
	file_client_args (dict, optional): Arguments to instantiate a
	FileClient. See :class:`mmcv.fileio.FileClient` for details.
	Default: None.
	`New in version 1.3.16.`
	"""

	def __init__(self,
	by_epoch=True,
	interval=10,
	ignore_last=True,
	reset_flag=False,
	interval_exp_name=1000,
	out_dir=None,
	out_suffix=('.log.json', '.log', '.py'),
	keep_local=True,
	file_client_args=None):
	super(TextLoggerHook, self).__init__(interval, ignore_last, reset_flag,
	by_epoch)
	self.by_epoch = by_epoch
	self.time_sec_tot = 0
	self.interval_exp_name = interval_exp_name

	if out_dir is None and file_client_args is not None:
	raise ValueError(
	'file_client_args should be "None" when `out_dir` is not'
	'specified.')
	self.out_dir = out_dir

	if not (out_dir is None or isinstance(out_dir, str)
	or is_tuple_of(out_dir, str)):
	raise TypeError('out_dir should be "None" or string or tuple of '
	'string, but got {out_dir}')
	self.out_suffix = out_suffix

	self.keep_local = keep_local
	self.file_client_args = file_client_args
	if self.out_dir is not None:
	self.file_client = FileClient.infer_client(file_client_args,
	self.out_dir)

	def before_run(self, runner):
	super(TextLoggerHook, self).before_run(runner)

	if self.out_dir is not None:
	self.file_client = FileClient.infer_client(self.file_client_args,
	self.out_dir)
	# The final `self.out_dir` is the concatenation of `self.out_dir`
	# and the last level directory of `runner.work_dir`
	basename = osp.basename(runner.work_dir.rstrip(osp.sep))
	self.out_dir = self.file_client.join_path(self.out_dir, basename)
	runner.logger.info(
	(f'Text logs will be saved to {self.out_dir} by '
	f'{self.file_client.name} after the training process.'))

	self.start_iter = runner.iter
	self.json_log_path = osp.join(runner.work_dir,
	f'{runner.timestamp}.log.json')
	if runner.meta is not None:
	self._dump_log(runner.meta, runner)

	def _get_max_memory(self, runner):
	device = getattr(runner.model, 'output_device', None)
	mem = torch.cuda.max_memory_allocated(device=device)
	mem_mb = torch.tensor([mem / (1024 * 1024)],
	dtype=torch.int,
	device=device)
	if runner.world_size > 1:
	dist.reduce(mem_mb, 0, op=dist.ReduceOp.MAX)
	return mem_mb.item()

	def _log_info(self, log_dict, runner):
	# print exp name for users to distinguish experiments
	# at every ``interval_exp_name`` iterations and the end of each epoch
	if runner.meta is not None and 'exp_name' in runner.meta:
	if (self.every_n_iters(runner, self.interval_exp_name)) or (
	self.by_epoch and self.end_of_epoch(runner)):
	exp_info = f'Exp name: {runner.meta["exp_name"]}'
	runner.logger.info(exp_info)

	if log_dict['mode'] == 'train':
	if isinstance(log_dict['lr'], dict):
	lr_str = []
	for k, val in log_dict['lr'].items():
	lr_str.append(f'lr_{k}: {val:.3e}')
	lr_str = ' '.join(lr_str)
	else:
	lr_str = f'lr: {log_dict["lr"]:.3e}'

	# by epoch: Epoch [4][100/1000]
	# by iter: Iter [100/100000]
	if self.by_epoch:
	log_str = f'Epoch [{log_dict["epoch"]}]' \
	f'[{log_dict["iter"]}/{len(runner.data_loader)}]\t'
	else:
	log_str = f'Iter [{log_dict["iter"]}/{runner.max_iters}]\t'
	log_str += f'{lr_str}, '

	if 'time' in log_dict.keys():
	self.time_sec_tot += (log_dict['time'] * self.interval)
	time_sec_avg = self.time_sec_tot / (
	runner.iter - self.start_iter + 1)
	eta_sec = time_sec_avg * (runner.max_iters - runner.iter - 1)
	eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
	log_str += f'eta: {eta_str}, '
	log_str += f'time: {log_dict["time"]:.3f}, ' \
	f'data_time: {log_dict["data_time"]:.3f}, '
	# statistic memory
	if torch.cuda.is_available():
	log_str += f'memory: {log_dict["memory"]}, '
	else:
	# val/test time
	# here 1000 is the length of the val dataloader
	# by epoch: Epoch[val] [4][1000]
	# by iter: Iter[val] [1000]
	if self.by_epoch:
	log_str = f'Epoch({log_dict["mode"]}) ' \
	f'[{log_dict["epoch"]}][{log_dict["iter"]}]\t'
	else:
	log_str = f'Iter({log_dict["mode"]}) [{log_dict["iter"]}]\t'

	log_items = []
	for name, val in log_dict.items():
	# TODO: resolve this hack
	# these items have been in log_str
	if name in [
	'mode', 'Epoch', 'iter', 'lr', 'time', 'data_time',
	'memory', 'epoch'
	]:
	continue
	if isinstance(val, float):
	val = f'{val:.4f}'
	log_items.append(f'{name}: {val}')
	log_str += ', '.join(log_items)

	runner.logger.info(log_str)

	def _dump_log(self, log_dict, runner):
	# dump log in json format
	json_log = OrderedDict()
	for k, v in log_dict.items():
	json_log[k] = self._round_float(v)
	# only append log at last line
	if runner.rank == 0:
	with open(self.json_log_path, 'a+') as f:
	mmcv.dump(json_log, f, file_format='json')
	f.write('\n')

	def _round_float(self, items):
	if isinstance(items, list):
	return [self._round_float(item) for item in items]
	elif isinstance(items, float):
	return round(items, 5)
	else:
	return items

	def log(self, runner):
	if 'eval_iter_num' in runner.log_buffer.output:
	# this doesn't modify runner.iter and is regardless of by_epoch
	cur_iter = runner.log_buffer.output.pop('eval_iter_num')
	else:
	cur_iter = self.get_iter(runner, inner_iter=True)

	log_dict = OrderedDict(
	mode=self.get_mode(runner),
	epoch=self.get_epoch(runner),
	iter=cur_iter)

	# only record lr of the first param group
	cur_lr = runner.current_lr()
	if isinstance(cur_lr, list):
	log_dict['lr'] = cur_lr[0]
	else:
	assert isinstance(cur_lr, dict)
	log_dict['lr'] = {}
	for k, lr_ in cur_lr.items():
	assert isinstance(lr_, list)
	log_dict['lr'].update({k: lr_[0]})

	if 'time' in runner.log_buffer.output:
	# statistic memory
	if torch.cuda.is_available():
	log_dict['memory'] = self._get_max_memory(runner)

	log_dict = dict(log_dict, **runner.log_buffer.output)

	self._log_info(log_dict, runner)
	self._dump_log(log_dict, runner)
	return log_dict

	def after_run(self, runner):
	# copy or upload logs to self.out_dir
	if self.out_dir is not None:
	for filename in scandir(runner.work_dir, self.out_suffix, True):
	local_filepath = osp.join(runner.work_dir, filename)
	out_filepath = self.file_client.join_path(
	self.out_dir, filename)
	with open(local_filepath, 'r') as f:
	self.file_client.put_text(f.read(), out_filepath)

	runner.logger.info(
	(f'The file {local_filepath} has been uploaded to '
	f'{out_filepath}.'))

	if not self.keep_local:
	os.remove(local_filepath)
	runner.logger.info(
	(f'{local_filepath} was removed due to the '
	'`self.keep_local=False`'))