# Custom mmcv training hooks: gradient-usage checker, sampler iteration-skip,
# and ADLR auto-suspend/resume support.
import logging
import os
import sys
from typing import Dict, List, Optional, Tuple, Union
import mmcv
import torch.distributed as dist
from mmcv.runner.hooks.hook import HOOKS, Hook
from mmcv.runner.checkpoint import save_checkpoint
# from mmdet3d_plugin.models.utils import run_time
@HOOKS.register_module()
class GradChecker(Hook):
    """Hook that reports parameters which received no gradient.

    After each training iteration, any parameter that requires a gradient
    but whose ``.grad`` is still ``None`` was not touched by the
    forward/backward pass; it is reported so unused modules can be found.
    """

    def after_train_iter(self, runner):
        """Warn about every trainable parameter with a ``None`` gradient.

        Args:
            runner: The training runner; ``runner.model`` must expose
                ``named_parameters()``.
        """
        for key, val in runner.model.named_parameters():
            # `is None`, not `== None`: `==` on a tensor is element-wise
            # comparison, while identity is the correct missing-grad check.
            if val.grad is None and val.requires_grad:
                # Typo fixed: "WARNNING" -> "WARNING".
                print(
                    "WARNING: {key}'s parameters are not be used!!!!".format(key=key)
                )
@HOOKS.register_module()
class SamplerSkipIterationHook(Hook):
    """Data-loading sampler for distributed training.

    When distributed training, it is only useful in conjunction with
    :obj:`EpochBasedRunner`, while :obj:`IterBasedRunner` achieves the same
    purpose with :obj:`IterLoader`.
    """

    def __init__(self, out_dir=None):
        """Init routine."""
        self.out_dir = out_dir

    def before_train_epoch(self, runner):
        """Ask the loader's sampler to skip ahead to the current inner iter."""
        loader = runner.data_loader
        skip_to = runner._inner_iter
        if hasattr(loader.sampler, 'skip_iter_at_epoch_x'):
            # Sampler is attached directly to the data loader (e.g. a plain
            # SequentialSampler subclass in PyTorch).
            loader.sampler.skip_iter_at_epoch_x(skip_to)
        elif hasattr(loader.batch_sampler.sampler, 'skip_iter_at_epoch_x'):
            # PyTorch's BatchSampler wraps the real sampler as an attribute,
            # so look one level deeper.
            loader.batch_sampler.sampler.skip_iter_at_epoch_x(skip_to)
_logger = logging.getLogger("autoresume_hook")
sys.path.append(os.environ.get("SUBMIT_SCRIPTS", "."))
try:
_logger.info("Importing AutoResume lib...")
from userlib.auto_resume import AutoResume
AutoResume.init()
_logger.info("Found AutoResume SDK!")
except:
_logger.info("Did not find AutResume SDK!")
AutoResume = None
@HOOKS.register_module()
class AutoResumeHook(Hook):
    """AutoResume hook.

    A hook to interface with ADLR's AutoResume SDK.

    In order to use this hook, you must first import the AutoResume SDK
    in the main training script::

        sys.path.append(os.environ.get("SUBMIT_SCRIPTS", "."))
        try:
            _logger.info("Importing AutoResume lib...")
            from userlib.auto_resume import AutoResume
            AutoResume.init()
            _logger.info("Success!")
        except Exception:
            _logger.info("Failed!")
            AutoResume = None

    Also make sure you import the code for the auto-resume hook::

        import autoresume_hook

    In the main initialization routine, set the ``resume`` flag in the MMCV
    config depending on whether the job is being resumed::

        if AutoResume is not None:
            auto_resume_details = AutoResume.get_resume_details()
            if auto_resume_details is not None:
                print_log(f"AutoResume details: {auto_resume_details}")
                cfg.resume = True

    Finally, in your MMSEG config, add the following statements::

        # Hook for auto-suspend/resume on ADLR clusters.
        custom_hooks = [dict(type="AutoResumeHook", interval=2000)]

    Args:
        interval: interval (in number of iterations) between checks as to
            whether to suspend.
        out_dir: directory into which the preemption checkpoint is written.
    """

    def __init__(self, interval: int = 1000, out_dir=None):
        """Init routine."""
        self.interval = interval
        self.out_dir = out_dir

    def every_n_train_iters(self, runner, n):
        """Return True every ``n`` training iterations.

        NOTE: this intentionally also fires at ``runner.iter == 0``, unlike
        mmcv's built-in ``every_n_iters`` which checks ``iter + 1``.
        """
        return runner.iter % n == 0 if n > 0 else False

    def after_train_iter(self, runner: mmcv.runner.Runner) -> None:
        """Training iteration post hook.

        Periodically checks whether the cluster has requested preemption; if
        so, saves a checkpoint (rank 0 only), requests a resume and exits.

        Args:
            runner: The runner of the training process.
        """
        if not self.every_n_train_iters(runner, self.interval):
            return

        global_rank = dist.get_rank() if dist.is_initialized() else 0

        runner.logger.info("AutoResumeHook: Checking whether to suspend...")
        # Check whether to suspend the job.
        should_preempt = (
            AutoResume is not None and AutoResume.termination_requested()
        )
        if not should_preempt:
            return

        # Resume bookkeeping stored alongside the model/optimizer state.
        meta = {
            'epoch': runner.epoch,
            'iter': runner.iter,
            'inner_iter': runner.inner_iter,
        }
        runner.logger.info("saving info %s", meta)

        filename = (
            f"iter_{runner.iter + 1:010d}"
            f"_epoch_{runner.epoch:04d}"
            f"_inneriter_{runner.inner_iter + 1:08d}.pth"
        )

        if global_rank == 0:
            # Fix: previously every rank wrote the same checkpoint file and
            # symlink concurrently, which can corrupt both. Only rank 0
            # writes; DDP keeps model/optimizer state replicated, so the
            # rank-0 copy is sufficient.
            runner.logger.info("refresh the latest_iter.pth")
            filepath = os.path.join(self.out_dir, filename)
            save_checkpoint(
                runner.model, filepath, optimizer=runner.optimizer, meta=meta
            )
            # Customized symbolic link pointing at the newest checkpoint.
            dst_file = os.path.join(self.out_dir, 'latest_iter.pth')
            mmcv.symlink(filename, dst_file)

        if dist.is_initialized():
            # Keep non-zero ranks alive until rank 0 has finished writing the
            # checkpoint, so the job launcher does not tear them down early.
            dist.barrier()

        if global_rank == 0:
            runner.logger.info("AutoResumeHook: Request resume...")
            AutoResume.request_resume()

        runner.logger.info("AutoResumeHook: Suspend the job...")
        sys.exit(0)