20051216
Browse files
- diffusion.py +4 -2
- perlmutter_diffusion.sbatch +3 -2
- quantify_results.ipynb +2 -2
diffusion.py
CHANGED
|
@@ -363,7 +363,7 @@ def get_gpu_info(device):
|
|
| 363 |
|
| 364 |
class DDPM21CM:
|
| 365 |
def __init__(self, config):
|
| 366 |
-
config.run_name = datetime.now().strftime("%d%H%M%S") # the unique name of each experiment
|
| 367 |
self.config = config
|
| 368 |
self.ddpm = DDPMScheduler(betas=(1e-4, 0.02), num_timesteps=config.num_timesteps, img_shape=config.img_shape, device=config.device, config=config,)#, dtype=config.dtype
|
| 369 |
|
|
@@ -381,7 +381,7 @@ class DDPM21CM:
|
|
| 381 |
# print(f"resumed nn_model from {config.resume}")
|
| 382 |
self.nn_model.module.load_state_dict(torch.load(config.resume)['unet_state_dict'])
|
| 383 |
#self.nn_model.module.to(config.dtype)
|
| 384 |
-
print(f"{config.run_name} cuda:{torch.cuda.current_device()}/{self.config.global_rank} resumed nn_model from {config.resume} with {sum(x.numel() for x in self.nn_model.parameters())} parameters".center(self.config.str_len,'+'))
|
| 385 |
else:
|
| 386 |
print(f"{config.run_name} cuda:{torch.cuda.current_device()}/{self.config.global_rank} initialized nn_model randomly with {sum(x.numel() for x in self.nn_model.parameters())} parameters, {datetime.now().strftime('%d-%H:%M:%S.%f')}".center(self.config.str_len,'+'))
|
| 387 |
|
|
@@ -713,6 +713,7 @@ if __name__ == "__main__":
|
|
| 713 |
parser.add_argument("--autocast", type=int, required=False, default=False)
|
| 714 |
parser.add_argument("--use_checkpoint", type=int, required=False, default=False)
|
| 715 |
parser.add_argument("--dropout", type=float, required=False, default=0)
|
|
|
|
| 716 |
|
| 717 |
args = parser.parse_args()
|
| 718 |
|
|
@@ -731,6 +732,7 @@ if __name__ == "__main__":
|
|
| 731 |
config.autocast = bool(args.autocast)
|
| 732 |
config.use_checkpoint = bool(args.use_checkpoint)
|
| 733 |
config.dropout = args.dropout
|
|
|
|
| 734 |
|
| 735 |
############################ training ################################
|
| 736 |
if args.train:
|
|
|
|
| 363 |
|
| 364 |
class DDPM21CM:
|
| 365 |
def __init__(self, config):
|
| 366 |
+
config.run_name = os.environ.get("SLURM_JOB_ID", datetime.now().strftime("%d%H%M%S")) # the unique name of each experiment
|
| 367 |
self.config = config
|
| 368 |
self.ddpm = DDPMScheduler(betas=(1e-4, 0.02), num_timesteps=config.num_timesteps, img_shape=config.img_shape, device=config.device, config=config,)#, dtype=config.dtype
|
| 369 |
|
|
|
|
| 381 |
# print(f"resumed nn_model from {config.resume}")
|
| 382 |
self.nn_model.module.load_state_dict(torch.load(config.resume)['unet_state_dict'])
|
| 383 |
#self.nn_model.module.to(config.dtype)
|
| 384 |
+
print(f"{config.run_name} cuda:{torch.cuda.current_device()}/{self.config.global_rank} resumed nn_model from {config.resume} with {sum(x.numel() for x in self.nn_model.parameters())} parameters, {datetime.now().strftime('%d-%H:%M:%S.%f')}".center(self.config.str_len,'+'))
|
| 385 |
else:
|
| 386 |
print(f"{config.run_name} cuda:{torch.cuda.current_device()}/{self.config.global_rank} initialized nn_model randomly with {sum(x.numel() for x in self.nn_model.parameters())} parameters, {datetime.now().strftime('%d-%H:%M:%S.%f')}".center(self.config.str_len,'+'))
|
| 387 |
|
|
|
|
| 713 |
parser.add_argument("--autocast", type=int, required=False, default=False)
|
| 714 |
parser.add_argument("--use_checkpoint", type=int, required=False, default=False)
|
| 715 |
parser.add_argument("--dropout", type=float, required=False, default=0)
|
| 716 |
+
parser.add_argument("--lrate", type=float, required=False, default=1e-4)
|
| 717 |
|
| 718 |
args = parser.parse_args()
|
| 719 |
|
|
|
|
| 732 |
config.autocast = bool(args.autocast)
|
| 733 |
config.use_checkpoint = bool(args.use_checkpoint)
|
| 734 |
config.dropout = args.dropout
|
| 735 |
+
config.lrate = args.lrate
|
| 736 |
|
| 737 |
############################ training ################################
|
| 738 |
if args.train:
|
perlmutter_diffusion.sbatch
CHANGED
|
@@ -35,10 +35,11 @@ srun python diffusion.py \
|
|
| 35 |
--num_new_img_per_gpu 4 \
|
| 36 |
--max_num_img_per_gpu 2 \
|
| 37 |
--gradient_accumulation_steps 1 \
|
| 38 |
-
--autocast
|
| 39 |
--use_checkpoint 1 \
|
| 40 |
--dropout 0.1 \
|
| 41 |
-
--
|
|
|
|
| 42 |
#--train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
|
| 43 |
|
| 44 |
date
|
|
|
|
| 35 |
--num_new_img_per_gpu 4 \
|
| 36 |
--max_num_img_per_gpu 2 \
|
| 37 |
--gradient_accumulation_steps 1 \
|
| 38 |
+
--autocast 1 \
|
| 39 |
--use_checkpoint 1 \
|
| 40 |
--dropout 0.1 \
|
| 41 |
+
--lrate 7e-5 \
|
| 42 |
+
--resume ./outputs/model-N1600-device_count4-node4-epoch34-20051216 \
|
| 43 |
#--train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
|
| 44 |
|
| 45 |
date
|
quantify_results.ipynb
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8144cb4596999bacefce85cad17af66ea5ebfd5e11d2517361e365fce9895b45
|
| 3 |
+
size 15754478
|