#!/usr/bin/env python
#
# file: $ISIP_EXP/SOGMP/scripts/train.py
#
# revision history: xzt
#  20220824 (TE): first version
#
# usage:
#  python train.py mdir train_data val_data
#
# arguments:
#  mdir: the path where the output model is stored
#  train_data: the directory of training data
#  val_data: the directory of validation data
#
# This script trains a Semantic CNN model
#------------------------------------------------------------------------------

# import pytorch modules
#
import torch
import torch.nn as nn
from torch.optim import Adam
from tqdm import tqdm

# visualize:
from tensorboardX import SummaryWriter

# import the model and all of its variables/functions
#
from model import *

# import modules
#
import sys
import os

#-----------------------------------------------------------------------------
#
# global variables are listed here
#
#-----------------------------------------------------------------------------

# general global values
#
model_dir = './model/semantic_cnn_model.pth'   # the path of model storage
NUM_ARGS = 3
NUM_EPOCHS = 4000
BATCH_SIZE = 64

# keys for the Adam optimizer's keyword arguments
#
LEARNING_RATE = "lr"
BETAS = "betas"
EPS = "eps"
WEIGHT_DECAY = "weight_decay"

# for reproducibility, we seed the rng
#
set_seed(SEED1)

# adjust_learning_rate: step the learning rate down as training progresses
#
def adjust_learning_rate(optimizer, epoch):
    lr = 1e-3
    if epoch > 40:
        lr = 2e-4
    if epoch > 2000:
        lr = 2e-5
    if epoch > 21000:
        lr = 1e-5
    if epoch > 32984:
        lr = 1e-6
    if epoch > 48000:
        lr = lr * (0.1 ** (epoch // 110000))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

# train function: run one epoch of training and return the average loss
#
def train(model, dataloader, dataset, device, optimizer, criterion, epoch, epochs):
    ################################## Train #####################################
    # set model to training mode:
    model.train()

    # for each batch in increments of batch size
    #
    running_loss = 0
    # get the number of batches (floor of train_data/batch_size, which is
    # exact here because the dataloader uses drop_last=True):
    num_batches = len(dataset) // dataloader.batch_size
    for i, batch in tqdm(enumerate(dataloader), total=num_batches):
        # collect the samples as a batch:
        scan_maps = batch['scan_map'].to(device)
        semantic_maps = batch['semantic_map'].to(device)
        sub_goals = batch['sub_goal'].to(device)
        velocities = batch['velocity'].to(device)

        # set all gradients to 0:
        optimizer.zero_grad()

        # feed the network the batch
        #
        output = model(scan_maps, semantic_maps, sub_goals)

        # get the loss, masking out samples whose velocity is all zeros
        # --------------------------------------------------------------
        mask = (velocities != 0).any(dim=1)   # (B,)
        if mask.sum() == 0:
            loss = output.sum() * 0   # safe zero loss that keeps the graph alive
        else:
            loss = criterion(output[mask], velocities[mask])

        # multiple GPUs: reduce the per-device losses to a scalar first
        if torch.cuda.device_count() > 1:
            loss = loss.mean()

        # perform back propagation:
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # display informational message
        #
        if (i % 1280 == 0):
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                  .format(epoch, epochs, i + 1, num_batches, loss.item()))

    train_loss = running_loss / len(dataset)
    return train_loss
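# ------------------------------------------------------------------
# illustrative sketch (not part of the original pipeline and never
# invoked by this script): shows how the zero-velocity mask used in
# train()/validate() behaves on a toy batch, assuming velocities of
# shape (B, 2); the helper name is hypothetical.
# ------------------------------------------------------------------
def _mask_example():
    velocities = torch.tensor([[0.0, 0.0],    # all-zero row -> masked out
                               [0.5, 0.1],
                               [0.0, 0.3]])
    mask = (velocities != 0).any(dim=1)       # tensor([False, True, True])
    return velocities[mask]                   # keeps only rows 1 and 2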
# validate function: run one epoch of evaluation and return the average loss
#
def validate(model, dataloader, dataset, device, criterion):
    ################################# Validate ###################################
    # set model to evaluation mode:
    model.eval()

    # for each batch in increments of batch size
    #
    running_loss = 0
    # get the number of batches (floor of dev_data/batch_size, which is
    # exact here because the dataloader uses drop_last=True):
    num_batches = len(dataset) // dataloader.batch_size
    # no gradients are needed during evaluation:
    with torch.no_grad():
        for i, batch in tqdm(enumerate(dataloader), total=num_batches):
            # collect the samples as a batch:
            scan_maps = batch['scan_map'].to(device)
            semantic_maps = batch['semantic_map'].to(device)
            sub_goals = batch['sub_goal'].to(device)
            velocities = batch['velocity'].to(device)

            # feed the network the batch
            #
            output = model(scan_maps, semantic_maps, sub_goals)

            # get the loss, masking out samples whose velocity is all zeros
            # --------------------------------------------------------------
            mask = (velocities != 0).any(dim=1)   # (B,)
            if mask.sum() == 0:
                loss = output.sum() * 0   # safe zero loss
            else:
                loss = criterion(output[mask], velocities[mask])

            # multiple GPUs: reduce the per-device losses to a scalar
            if torch.cuda.device_count() > 1:
                loss = loss.mean()

            running_loss += loss.item()

    val_loss = running_loss / len(dataset)
    return val_loss
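# ------------------------------------------------------------------
# illustrative sketch (not part of the original pipeline and never
# invoked by this script): with nn.MSELoss(reduction='sum'), dividing
# the accumulated loss by the dataset size, as train()/validate() do,
# yields the mean loss per sample; the helper name is hypothetical.
# ------------------------------------------------------------------
def _per_sample_loss_example():
    criterion = nn.MSELoss(reduction='sum')
    preds = torch.zeros(4, 2)
    targets = torch.ones(4, 2)
    total = criterion(preds, targets)   # 8.0: sum of (0 - 1)^2 over 8 elements
    return total.item() / 4             # 2.0: average loss per sample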
#------------------------------------------------------------------------------
#
# the main program starts here
#
#------------------------------------------------------------------------------

# function: main
#
# arguments: none
#
# return: none
#
# This method is the main function.
#
def main(argv):

    # ensure we have the correct amount of arguments
    #
    if (len(argv) != NUM_ARGS):
        print("usage: python train.py [MDL_PATH] [TRAIN_PATH] [DEV_PATH]")
        exit(-1)

    # define local variables
    #
    mdl_path = argv[0]
    pTrain = argv[1]
    pDev = argv[2]

    # get the output directory name
    #
    odir = os.path.dirname(mdl_path)

    # if the odir doesn't exist, we make it
    #
    if odir and not os.path.exists(odir):
        os.makedirs(odir)

    # set the device to use GPU if available
    #
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    ### train:
    print('...Start reading data...')

    # get the training data as a dataset/dataloader pair
    #
    train_dataset = NavDataset(pTrain, 'train')
    train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=BATCH_SIZE,
                                                   shuffle=True,
                                                   drop_last=True,
                                                   pin_memory=True)

    ### dev:
    # get the validation data as a dataset/dataloader pair
    #
    dev_dataset = NavDataset(pDev, 'dev')
    dev_dataloader = torch.utils.data.DataLoader(dev_dataset,
                                                 batch_size=BATCH_SIZE,
                                                 shuffle=True,
                                                 drop_last=True,
                                                 pin_memory=True)
    print('...Finish reading data...')

    # instantiate a model
    #
    model = SemanticCNN(Bottleneck, [2, 1, 1])
    # move the model to the selected device:
    model.to(device)

    # set the adam optimizer parameters
    #
    opt_params = {
        LEARNING_RATE: 0.001,
        BETAS: (0.9, 0.999),
        EPS: 1e-08,
        WEIGHT_DECAY: 0.001
    }

    # set the loss and optimizer
    #
    criterion = nn.MSELoss(reduction='sum')
    criterion.to(device)

    # create an optimizer, and pass the model params to it
    #
    optimizer = Adam(model.parameters(), **opt_params)

    # get the number of epochs to train on
    #
    epochs = NUM_EPOCHS

    # if there is a trained model, continue training from its checkpoint:
    if os.path.exists(mdl_path):
        checkpoint = torch.load(mdl_path, map_location=device)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        print('Load epoch {} success'.format(start_epoch))
    else:
        start_epoch = 0
        print('No trained models, restart training')

    # multiple GPUs:
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # dim = 0: a batch of [30, ...] splits into [10, ...] x 3 on 3 GPUs
        model = nn.DataParallel(model)
        # move the wrapped model to the device:
        model.to(device)

    # tensorboard writer:
    writer = SummaryWriter('runs')

    # for each epoch
    #
    epoch_num = start_epoch
    for epoch in range(start_epoch + 1, epochs):
        # adjust learning rate:
        adjust_learning_rate(optimizer, epoch)

        ################################## Train #####################################
        train_epoch_loss = train(
            model, train_dataloader, train_dataset, device, optimizer,
            criterion, epoch, epochs
        )

        ################################ Validate ####################################
        valid_epoch_loss = validate(
            model, dev_dataloader, dev_dataset, device, criterion
        )

        # log the epoch loss
        writer.add_scalar('training loss', train_epoch_loss, epoch)
        writer.add_scalar('validation loss', valid_epoch_loss, epoch)
        print('Train set: Average loss: {:.4f}'.format(train_epoch_loss))
        print('Validation set: Average loss: {:.4f}'.format(valid_epoch_loss))

        # save the model every 50 epochs
        #
        if (epoch % 50 == 0):
            if torch.cuda.device_count() > 1:
                # multiple GPUs: unwrap the DataParallel module first
                state = {'model': model.module.state_dict(),
                         'optimizer': optimizer.state_dict(),
                         'epoch': epoch}
            else:
                state = {'model': model.state_dict(),
                         'optimizer': optimizer.state_dict(),
                         'epoch': epoch}
            path = './model/model' + str(epoch) + '.pth'
            torch.save(state, path)
        epoch_num = epoch

    # save the final model
    if torch.cuda.device_count() > 1:
        # multiple GPUs: unwrap the DataParallel module first
        state = {'model': model.module.state_dict(),
                 'optimizer': optimizer.state_dict(),
                 'epoch': epoch_num}
    else:
        state = {'model': model.state_dict(),
                 'optimizer': optimizer.state_dict(),
                 'epoch': epoch_num}
    torch.save(state, mdl_path)

    # exit gracefully
    #
    return True
#
# end of function
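# ------------------------------------------------------------------
# illustrative sketch (not part of the original pipeline and never
# invoked by this script): exercises the checkpoint round trip that
# main() relies on when resuming, using a throwaway linear model; the
# helper name and the path are hypothetical.
# ------------------------------------------------------------------
def _checkpoint_example(path='/tmp/_ckpt_example.pth'):
    net = nn.Linear(4, 2)
    opt = Adam(net.parameters(), lr=1e-3)
    state = {'model': net.state_dict(),
             'optimizer': opt.state_dict(),
             'epoch': 7}
    torch.save(state, path)
    ckpt = torch.load(path, map_location='cpu')
    net.load_state_dict(ckpt['model'])
    opt.load_state_dict(ckpt['optimizer'])
    return ckpt['epoch']   # 7

# begin gracefully
#
if __name__ == '__main__':
    main(sys.argv[1:])

#
# end of file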