"""Training script for Deep3DFaceRecon_pytorch.

Runs training in the current process, or spawns one process per rank when
DDP is enabled in the training options.
"""
| |
|
| | import os |
| | import time |
| | import numpy as np |
| | import torch |
| | from options.train_options import TrainOptions |
| | from data import create_dataset |
| | from deep_3drecon_models import create_model |
| | from util.visualizer import MyVisualizer |
| | from util.util import genvalconf |
| | import torch.multiprocessing as mp |
| | import torch.distributed as dist |
| |
|
| |
|
| | def setup(rank, world_size, port): |
| | os.environ['MASTER_ADDR'] = 'localhost' |
| | os.environ['MASTER_PORT'] = port |
| |
|
| | |
| | dist.init_process_group("gloo", rank=rank, world_size=world_size) |
| |
|
def cleanup():
    """Destroy the default process group, if one exists.

    Safe to call when distributed training was never set up (or has already
    been torn down): in that case the call is a no-op instead of raising.
    """
    if dist.is_available() and dist.is_initialized():
        dist.destroy_process_group()
| |
|
def main(rank, world_size, train_opt):
    """Run the training loop, with periodic validation, in one process.

    Args:
        rank: index of this process; also used as the CUDA device index.
            Only rank 0 drives the visualizer and saves checkpoints.
        world_size: number of processes in the group (used only when
            ``train_opt.use_ddp`` is set).
        train_opt: parsed training options. The validation options are
            derived from it with ``genvalconf``.
    """
    val_opt = genvalconf(train_opt, isTrain=False)

    device = torch.device(rank)
    torch.cuda.set_device(device)
    use_ddp = train_opt.use_ddp

    if use_ddp:
        setup(rank, world_size, train_opt.ddp_port)

    train_dataset = create_dataset(train_opt, rank=rank)
    val_dataset = create_dataset(val_opt, rank=rank)
    train_dataset_batches = len(train_dataset) // train_opt.batch_size
    val_dataset_batches = len(val_dataset) // val_opt.batch_size

    model = create_model(train_opt)
    model.setup(train_opt)
    model.device = device
    model.parallelize()

    # The visualizer only exists on rank 0; every use below is rank-guarded.
    visualizer = None
    if rank == 0:
        print('The batch number of training images = %d\n'
              'the batch number of validation images = %d'
              % (train_dataset_batches, val_dataset_batches))
        model.print_networks(train_opt.verbose)
        visualizer = MyVisualizer(train_opt)

    # Start the global counter where a run resumed at `epoch_count` would be.
    total_iters = train_dataset_batches * (train_opt.epoch_count - 1)
    t_data = 0
    optimize_time = 0.1  # running (exponentially smoothed) time per sample
    batch_size = 1 if train_opt.display_per_batch else train_opt.batch_size

    if use_ddp:
        dist.barrier()

    for epoch in range(train_opt.epoch_count, train_opt.n_epochs + 1):
        epoch_start_time = time.time()
        iter_data_time = time.time()
        epoch_iter = 0

        train_dataset.set_epoch(epoch)
        for train_data in train_dataset:
            iter_start_time = time.time()
            if total_iters % train_opt.print_freq == 0:
                # time spent waiting for data since the previous iteration
                t_data = iter_start_time - iter_data_time
            total_iters += batch_size
            epoch_iter += batch_size

            # Synchronise so the timer brackets the actual GPU work.
            torch.cuda.synchronize()
            optimize_start_time = time.time()

            model.set_input(train_data)
            model.optimize_parameters()

            torch.cuda.synchronize()
            optimize_time = (time.time() - optimize_start_time) / batch_size * 0.005 + 0.995 * optimize_time

            if use_ddp:
                dist.barrier()

            # `total_iters == batch_size` is true on the very first iteration
            # of a fresh run, so each periodic action also fires once up front.
            first_iter = total_iters == batch_size

            if rank == 0 and (first_iter or total_iters % train_opt.display_freq == 0):
                model.compute_visuals()
                visualizer.display_current_results(
                    model.get_current_visuals(), total_iters, epoch,
                    save_results=True,
                    add_image=train_opt.add_image)

            if rank == 0 and (first_iter or total_iters % train_opt.print_freq == 0):
                losses = model.get_current_losses()
                visualizer.print_current_losses(epoch, epoch_iter, losses, optimize_time, t_data)
                visualizer.plot_current_losses(total_iters, losses)

            if first_iter or total_iters % train_opt.evaluation_freq == 0:
                with torch.no_grad():
                    torch.cuda.synchronize()
                    val_start_time = time.time()
                    losses_avg = {}
                    num_eval_batches = 0  # batches actually summed into losses_avg
                    model.eval()
                    for j, val_data in enumerate(val_dataset):
                        model.set_input(val_data)
                        model.optimize_parameters(isTrain=False)
                        if rank == 0 and j < train_opt.vis_batch_nums:
                            model.compute_visuals()
                            visualizer.display_current_results(
                                model.get_current_visuals(), total_iters, epoch,
                                dataset='val', save_results=True, count=j * val_opt.batch_size,
                                add_image=train_opt.add_image)

                        if j < train_opt.eval_batch_nums:
                            losses = model.get_current_losses()
                            for key, value in losses.items():
                                losses_avg[key] = losses_avg.get(key, 0) + value
                            num_eval_batches += 1

                    # Average over the batches that were really accumulated.
                    # Dividing by an estimate derived from the dataset length
                    # is wrong when the loader yields a different number of
                    # batches, and divides by zero when the validation set is
                    # smaller than one batch. losses_avg is empty when nothing
                    # was accumulated, so no division happens in that case.
                    for key, value in losses_avg.items():
                        losses_avg[key] = value / num_eval_batches

                    torch.cuda.synchronize()
                    eval_time = time.time() - val_start_time

                    if rank == 0:
                        visualizer.print_current_losses(epoch, epoch_iter, losses_avg, eval_time, t_data, dataset='val')
                        visualizer.plot_current_losses(total_iters, losses_avg, dataset='val')
                model.train()

            if use_ddp:
                dist.barrier()

            if rank == 0 and (first_iter or total_iters % train_opt.save_latest_freq == 0):
                print('saving the latest model (epoch %d, total_iters %d)' % (epoch, total_iters))
                print(train_opt.name)
                save_suffix = 'iter_%d' % total_iters if train_opt.save_by_iter else 'latest'
                model.save_networks(save_suffix)

            if use_ddp:
                dist.barrier()

            iter_data_time = time.time()

        print('End of epoch %d / %d \t Time Taken: %d sec' % (epoch, train_opt.n_epochs, time.time() - epoch_start_time))
        model.update_learning_rate()

        if rank == 0 and epoch % train_opt.save_epoch_freq == 0:
            print('saving the model at the end of epoch %d, iters %d' % (epoch, total_iters))
            model.save_networks('latest')
            model.save_networks(epoch)

        if use_ddp:
            dist.barrier()

    # Release the process group created by setup() once training is done.
    if use_ddp:
        cleanup()
| |
|
if __name__ == '__main__':
    import warnings

    # Silence every warning category for the duration of the run.
    warnings.filterwarnings("ignore")

    opt = TrainOptions().parse()
    n_procs = opt.world_size

    if not opt.use_ddp:
        # Single-process training: run directly as rank 0.
        main(0, n_procs, opt)
    else:
        # One worker per rank; mp.spawn passes the rank as the first argument.
        mp.spawn(main, args=(n_procs, opt), nprocs=n_procs, join=True)
| |
|