| import argparse |
| import os |
| import sys |
| import torch as th |
| from torch.utils.data.distributed import DistributedSampler |
| from torch.nn.parallel import DistributedDataParallel as DDP |
|
|
| |
| sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) |
|
|
| from guided_diffusion import dist_util, logger |
| from guided_diffusion.resample import create_named_schedule_sampler |
| from guided_diffusion.custom_lidc_dataset import CustomLIDCDataset |
| from guided_diffusion.script_util import ( |
| model_and_diffusion_defaults, |
| create_model_and_diffusion, |
| args_to_dict, |
| add_dict_to_argparser, |
| ) |
| from guided_diffusion.train_util import TrainLoop |
|
|
def main():
    """Run distributed training of the model built by ``create_model_and_diffusion``.

    Steps, in order: parse the command-line arguments, initialise the
    distributed backend, select this process's CUDA device, build the
    model / diffusion / prior / posterior, wrap the model in
    ``DistributedDataParallel``, build a rank-sharded data loader over
    ``CustomLIDCDataset`` and hand everything to ``TrainLoop.run_loop``.

    Raises:
        KeyError: if the ``LOCAL_RANK`` environment variable is not set
            (launchers such as ``torchrun`` export it for every worker).
    """
    args = create_argparser().parse_args()

    # Distributed bootstrap: the process group first, then the CUDA device
    # that belongs to this worker.
    dist_util.setup_dist()
    local_rank = int(os.environ["LOCAL_RANK"])
    th.cuda.set_device(local_rank)

    # NOTE(review): the source's indentation was lost; the logger setup is
    # assumed to be guarded by the rank-0 check together with the directory
    # creation -- confirm against the original file.
    if dist_util.get_rank() == 0:
        os.makedirs(args.log_dir, exist_ok=True)
        logger.configure(dir=args.log_dir)

    logger.log("creating model, diffusion, prior and posterior distribution...")
    model_kwargs = args_to_dict(args, model_and_diffusion_defaults().keys())
    model, diffusion, prior, posterior = create_model_and_diffusion(**model_kwargs)
    for module in (model, prior, posterior):
        module.to(dist_util.dev())

    # Only the model is wrapped in DDP; prior and posterior are passed to the
    # training loop as plain modules.
    ddp_options = {
        "device_ids": [local_rank],
        "output_device": local_rank,
        "broadcast_buffers": False,
        "bucket_cap_mb": 128,
        "find_unused_parameters": False,
    }
    model = DDP(model, **ddp_options)

    schedule_sampler = create_named_schedule_sampler(
        args.schedule_sampler, diffusion, maxt=1000
    )

    logger.log("creating data loader...")
    dataset = CustomLIDCDataset(
        data_root=args.data_dir,
        split="train",
        image_size=args.image_size,
        dataset_type=args.dataset_type,
        split_strategy=args.split_strategy,
    )
    # The sampler decides which samples this rank sees, so the loader itself
    # is not given a ``shuffle`` flag.
    sampler = DistributedSampler(
        dataset,
        num_replicas=dist_util.get_world_size(),
        rank=dist_util.get_rank(),
        shuffle=True,
    )
    dataloader = th.utils.data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        sampler=sampler,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=True,
    )

    logger.log("training...")
    # Hyperparameters forwarded unchanged from the parsed arguments.
    passthrough = (
        "batch_size",
        "microbatch",
        "lr",
        "ema_rate",
        "log_interval",
        "save_interval",
        "resume_checkpoint",
        "use_fp16",
        "fp16_scale_growth",
        "weight_decay",
        "lr_anneal_steps",
        "total_steps",
    )
    hyperparams = {name: getattr(args, name) for name in passthrough}
    trainer = TrainLoop(
        model=model,
        diffusion=diffusion,
        classifier=None,
        prior=prior,
        posterior=posterior,
        # Both an iterator and the loader itself are handed over, exactly as
        # the training loop's interface is used by this script.
        data=iter(dataloader),
        dataloader=dataloader,
        schedule_sampler=schedule_sampler,
        **hyperparams,
    )
    trainer.run_loop()
|
|
def create_argparser():
    """Build the command-line parser for the training script.

    The script-level defaults below are merged with
    ``model_and_diffusion_defaults()``; when a key appears in both, the
    model/diffusion value replaces the script value. The merged mapping is
    handed to ``add_dict_to_argparser`` to populate the parser.

    Returns:
        argparse.ArgumentParser: the populated parser.
    """
    script_defaults = {
        # Data and logging locations.
        "data_dir": "./data/LIDC",
        "dataset_type": "lidc",
        "split_strategy": "all_annotations",
        "log_dir": "./results/training_logs",
        # Optimisation settings.
        "schedule_sampler": "uniform",
        "lr": 1e-4,
        "weight_decay": 0.0,
        "lr_anneal_steps": 0,
        "batch_size": 4,
        "microbatch": -1,
        "ema_rate": "0.9999",
        # Bookkeeping.
        "log_interval": 100,
        "save_interval": 5000,
        "resume_checkpoint": "",
        "use_fp16": False,
        "fp16_scale_growth": 1e-3,
        "num_workers": 4,
        "total_steps": 50000,
    }
    # Later entries win, so model/diffusion defaults override script ones.
    merged_defaults = {**script_defaults, **model_and_diffusion_defaults()}

    arg_parser = argparse.ArgumentParser()
    add_dict_to_argparser(arg_parser, merged_defaults)
    return arg_parser
|
|
# Start training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|