# S23DR-P2R / train.py
# colin1842's picture
# add model
# 8d5039c
import os
import torch
import torch.nn as nn
import argparse
import datetime
import glob
import torch.distributed as dist
from dataset.data_utils import build_dataloader
from train_utils import train_model
from model.roofnet import RoofNet
from torch import optim
from utils import common_utils
from model import model_utils
import wandb
def get_scheduler(optim, last_epoch, *, step_size=20, gamma=0.5):
    """Build the step-decay learning-rate scheduler for an optimizer.

    Args:
        optim: Optimizer whose learning rate will be scheduled. Inside this
            function the parameter shadows the module-level ``optim`` import,
            which is why the scheduler is reached through ``torch.optim``.
        last_epoch: Forwarded to ``StepLR`` as ``last_epoch``. ``-1`` starts a
            fresh schedule; any other value resumes one, and PyTorch then
            requires every param group of ``optim`` to already contain
            ``'initial_lr'`` (otherwise it raises ``KeyError``).
        step_size: Number of scheduler steps between two decays.
        gamma: Multiplicative factor applied to the learning rate at each
            decay.

    Returns:
        A ``torch.optim.lr_scheduler.StepLR`` bound to ``optim``.
    """
    return torch.optim.lr_scheduler.StepLR(
        optim,
        step_size=step_size,
        gamma=gamma,
        last_epoch=last_epoch,
    )
def parse_config():
    """Parse the command-line options and load the model configuration.

    Returns:
        tuple: ``(args, cfg)`` where ``args`` is the parsed
        ``argparse.Namespace`` and ``cfg`` is whatever
        ``common_utils.cfg_from_yaml_file`` returns for ``args.cfg_file``.
    """
    # One row per supported option: (flag, type, default, help text).
    option_table = (
        ('--data_path', str, 'Data/hoho_data_train', 'dataset path'),
        ('--cfg_file', str, './model_cfg.yaml', 'model config for training'),
        ('--batch_size', int, 256, 'batch size for training'),
        ('--gpu', str, '0', 'gpu for training'),
        ('--extra_tag', str, 'hoho_train', 'extra tag for this experiment'),
        ('--epochs', int, 120, 'number of epochs to train for'),
        ('--lr', float, 1e-3, 'learning rate'),
    )

    cli = argparse.ArgumentParser()
    for flag, value_type, fallback, description in option_table:
        cli.add_argument(flag, type=value_type, default=fallback, help=description)

    args = cli.parse_args()
    return args, common_utils.cfg_from_yaml_file(args.cfg_file)
def main():
    """Set up and launch a RoofNet training run.

    Steps, in order: parse options, pin the visible GPU(s), start a wandb
    run, create the output/checkpoint directories and the file logger, build
    the training dataloader, model and Adam optimizer, resume from the most
    recently modified checkpoint in the checkpoint directory if one exists,
    build the LR scheduler, and hand everything to ``train_model``.
    """
    args, cfg = parse_config()

    # Limit which GPUs are visible before the device is selected below.
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    # Record the run's hyperparameters in wandb.
    run_config = {
        "data_path": args.data_path,
        "cfg_file": args.cfg_file,
        "batch_size": args.batch_size,
        "gpu": args.gpu,
        "extra_tag": args.extra_tag,
        "epochs": args.epochs,
        "learning_rate": args.lr,
    }
    wandb.init(project="roofnet_training", config=run_config)

    # Fall back to a date-stamped tag when no tag is supplied.
    if args.extra_tag is not None:
        extra_tag = args.extra_tag
    else:
        extra_tag = 'model-%s' % datetime.datetime.now().strftime('%Y%m%d')

    # Layout: <ROOT_DIR>/output/<tag>/{ckpt/, log.txt}
    output_dir = cfg.ROOT_DIR / 'output' / extra_tag
    output_dir.mkdir(parents=True, exist_ok=True)
    ckpt_dir = output_dir / 'ckpt'
    ckpt_dir.mkdir(parents=True, exist_ok=True)

    logger = common_utils.create_logger(output_dir / 'log.txt')
    logger.info('**********************Start logging**********************')

    train_loader = build_dataloader(
        args.data_path, args.batch_size, cfg.DATA, training=True, logger=logger
    )

    net = RoofNet(cfg.MODEL)
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")
    net.to(device)

    optimizer = optim.Adam(net.parameters(), lr=args.lr, weight_decay=1e-3)

    # Defaults for a fresh run; overwritten below when a checkpoint is found.
    it = 0
    start_epoch = 0
    last_epoch = -1

    found_ckpts = glob.glob(str(ckpt_dir / '*checkpoint_epoch_*.pth'))
    if found_ckpts:
        # Resume from the checkpoint file with the latest modification time.
        newest_ckpt = sorted(found_ckpts, key=os.path.getmtime)[-1]
        it, start_epoch = model_utils.load_params_with_optimizer(
            net, newest_ckpt, optimizer=optimizer, logger=logger
        )
        # NOTE(review): whether this +1 offset is right depends on what
        # load_params_with_optimizer reports as the epoch -- confirm.
        last_epoch = start_epoch + 1

    scheduler = get_scheduler(optimizer, last_epoch=last_epoch)

    net = net.train()
    logger.info('**********************Start training**********************')
    train_model(
        net,
        optimizer,
        train_loader,
        scheduler,
        it,
        start_epoch,
        args.epochs,
        ckpt_dir,
    )
# Start training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()