Xsmos committed on
Commit
9f265ee
·
verified ·
1 Parent(s): 34e5bce
Files changed (2) hide show
  1. diffusion.py +9 -9
  2. learn_multi_node.py +47 -0
diffusion.py CHANGED
@@ -235,11 +235,11 @@ class TrainConfig:
235
  # repeat = 2
236
 
237
  # dim = 2
238
- dim = 3
239
  stride = (2,2) if dim == 2 else (2,2,4)
240
  num_image = 1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
241
- batch_size = 1#2#50#20#2#100 # 10
242
- n_epoch = 2#4# 10#50#20#20#2#5#25 # 120
243
  HII_DIM = 64
244
  num_redshift = 512#128#64#512#256#256#64#512#128
245
  channel = 1
@@ -499,7 +499,7 @@ class DDPM21CM:
499
  'unet_state_dict': self.nn_model.module.state_dict(),
500
  # 'ema_unet_state_dict': self.ema_model.state_dict(),
501
  }
502
- save_name = self.config.save_name+f"-N{self.config.num_image}-epoch{ep}-device{torch.cuda.current_device()}"
503
  torch.save(model_state, save_name)
504
  print(f'device {torch.cuda.current_device()} saved model at ' + save_name)
505
  # print('saved model at ' + config.save_dir + f"model_epoch_{ep}_test_{config.run_name}.pth")
@@ -586,7 +586,7 @@ def train(rank, world_size):
586
 
587
  ddp_setup(rank, world_size)
588
 
589
- num_train_image_list = [3200]#[3200]#[200]#[1600,3200,6400,12800,25600]
590
  for i, num_image in enumerate(num_train_image_list):
591
  config.num_image = num_image
592
  # config.world_size = world_size
@@ -677,9 +677,9 @@ if __name__ == "__main__":
677
  world_size = torch.cuda.device_count()
678
  print(f" sampling, world_size = {world_size} ".center(100,'-'))
679
  # num_train_image_list = [1600,3200,6400,12800,25600]
680
- num_train_image_list = [3200]
681
- num_new_img_per_gpu = 9
682
- max_num_img_per_gpu = 1
683
 
684
  params = torch.tensor([4.4, 131.341])
685
 
@@ -690,7 +690,7 @@ if __name__ == "__main__":
690
 
691
  for num_image in num_train_image_list:
692
  config.num_image = num_image
693
- config.resume = f"./outputs/model_state-N{num_image}-epoch6-device0"
694
 
695
  # print("ddpm21cm = DDPM21CM(config)")
696
  manager = mp.Manager()
 
235
  # repeat = 2
236
 
237
  # dim = 2
238
+ dim = 2
239
  stride = (2,2) if dim == 2 else (2,2,4)
240
  num_image = 1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
241
+ batch_size = 10#1#2#50#20#2#100 # 10
242
+ n_epoch = 5#4# 10#50#20#20#2#5#25 # 120
243
  HII_DIM = 64
244
  num_redshift = 512#128#64#512#256#256#64#512#128
245
  channel = 1
 
499
  'unet_state_dict': self.nn_model.module.state_dict(),
500
  # 'ema_unet_state_dict': self.ema_model.state_dict(),
501
  }
502
+ save_name = self.config.save_name+f"-N{self.config.num_image}-epoch{ep}-device_count{torch.cuda.device_count()}"
503
  torch.save(model_state, save_name)
504
  print(f'device {torch.cuda.current_device()} saved model at ' + save_name)
505
  # print('saved model at ' + config.save_dir + f"model_epoch_{ep}_test_{config.run_name}.pth")
 
586
 
587
  ddp_setup(rank, world_size)
588
 
589
+ num_train_image_list = [400]#[3200]#[200]#[1600,3200,6400,12800,25600]
590
  for i, num_image in enumerate(num_train_image_list):
591
  config.num_image = num_image
592
  # config.world_size = world_size
 
677
  world_size = torch.cuda.device_count()
678
  print(f" sampling, world_size = {world_size} ".center(100,'-'))
679
  # num_train_image_list = [1600,3200,6400,12800,25600]
680
+ num_train_image_list = [400]
681
+ num_new_img_per_gpu = 40
682
+ max_num_img_per_gpu = 20
683
 
684
  params = torch.tensor([4.4, 131.341])
685
 
 
690
 
691
  for num_image in num_train_image_list:
692
  config.num_image = num_image
693
+ config.resume = f"./outputs/model_state-N{num_image}-epoch{config.n_epoch-1}-device_count{torch.cuda.device_count()}"
694
 
695
  # print("ddpm21cm = DDPM21CM(config)")
696
  manager = mp.Manager()
learn_multi_node.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import torch.distributed as dist
4
+ import torch.multiprocessing as mp
5
+
6
def setup(rank, world_size):
    """Join the NCCL process group as `rank` of `world_size` processes.

    NOTE(review): MASTER_ADDR is hard-coded to localhost, so this only works
    single-node as written — replace with the master node's IP for a real
    multi-node launch.
    """
    os.environ["MASTER_ADDR"] = 'localhost'  # Replace with master node's IP
    os.environ["MASTER_PORT"] = '12355'
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
10
+
11
def cleanup():
    """Tear down the distributed process group created by setup()."""
    dist.destroy_process_group()
13
+
14
class MyDiffusionModel(torch.nn.Module):
    """Small CNN used to smoke-test the DDP setup.

    Expects input of shape (N, 3, 32, 32):
    conv1 (3->16, k3) -> pool/2 -> conv2 (16->32, k3) -> pool/2 gives a
    32 x 6 x 6 feature map, which is flattened into fc1 -> fc2 (10 logits).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = torch.nn.Conv2d(3, 16, 3, 1)
        # conv2 and fc2 were commented out in the original, but forward()
        # calls self.fc2 (AttributeError) and fc1's input size (32 * 6 * 6)
        # only matches the feature map produced by conv2 — restore both so
        # the model is actually runnable.
        self.conv2 = torch.nn.Conv2d(16, 32, 3, 1)
        self.fc1 = torch.nn.Linear(32 * 6 * 6, 128)
        self.fc2 = torch.nn.Linear(128, 10)

    def forward(self, x):
        """Return class logits of shape (N, 10) for x of shape (N, 3, 32, 32)."""
        x = torch.nn.functional.relu(self.conv1(x))
        x = torch.nn.functional.max_pool2d(x, 2)
        x = torch.nn.functional.relu(self.conv2(x))
        x = torch.nn.functional.max_pool2d(x, 2)
        x = torch.flatten(x, 1)
        x = torch.nn.functional.relu(self.fc1(x))
        return self.fc2(x)
31
+
32
def main(rank, world_size):
    """Per-process entry point: join the process group, report the GPUs this
    rank can see, then tear the group down."""
    setup(rank, world_size)

    if not torch.cuda.is_available():
        print(f"Rank {rank}, No GPUs available")
    else:
        num_gpus = torch.cuda.device_count()
        print(f"Rank {rank}, Number of GPUs available: {num_gpus}")
        for i in range(num_gpus):
            print(f"Rank {rank}, GPU {i}: {torch.cuda.get_device_name(i)}")

    cleanup()
44
+
45
if __name__ == "__main__":
    # BUG FIX: the original set world_size = 1 ("number of nodes") while
    # spawning torch.cuda.device_count() processes. Each spawned rank then
    # called init_process_group with world_size=1, which is inconsistent
    # (and hangs/errors) whenever more than one GPU is present. For a
    # single-node run, world_size must equal the number of spawned processes.
    world_size = torch.cuda.device_count()
    mp.spawn(main, args=(world_size,), nprocs=world_size, join=True)