Xsmos committed
Commit 8a03583 · verified · 1 Parent(s): b83a72b

0728-1727

Files changed (2):
  1. diffusion.py +18 -10
  2. phoenix_diffusion.sbatch +6 -0
diffusion.py CHANGED
@@ -73,8 +73,10 @@ import torch.distributed as dist
 import argparse
 import socket
 import sys
+from datetime import timedelta
+
 # %%
-def ddp_setup(rank: int, world_size: int, local_world_size, master_addr, master_port):
+def ddp_setup(rank: int, world_size: int, master_addr, master_port):
     """
     Args:
         rank: Unique identifier of each process
@@ -89,7 +91,8 @@ def ddp_setup(rank: int, world_size: int, local_world_size, master_addr, master_
         backend="nccl",
         init_method=f"tcp://{master_addr}:{master_port}",
         rank=rank,
-        world_size=world_size*local_world_size
+        world_size=world_size,
+        timeout=timedelta(minutes=4)
     )
 
 # %%
@@ -659,15 +662,17 @@ class DDPM21CM:
 #num_train_image_list = [6000]#[60]#[8000]#[1000]#[100]#
 def train(rank, world_size, local_world_size, master_addr, master_port):
     #print("before ddp_setup")
-    ddp_setup(rank, world_size, local_world_size, master_addr, master_port)
+    global_rank = rank + local_world_size * int(os.environ["SLURM_NODEID"])
+
+    ddp_setup(global_rank, world_size, master_addr, master_port)
     #print("after ddp_setup")
-    local_rank = rank % local_world_size
-    torch.cuda.set_device(local_rank)
+    #local_rank = rank % local_world_size
+    torch.cuda.set_device(rank)
     #print("after set device")
-    print(f"rank = {rank}, local_rank = {local_rank}, world_size = {world_size}, local_world_size = {local_world_size}")
+    print(f"rank = {rank}, global_rank = {global_rank}, world_size = {world_size}, local_world_size = {local_world_size}")
 
     config = TrainConfig()
-    config.device = f"cuda:{local_rank}"
+    config.device = f"cuda:{rank}"
     config.world_size = local_world_size
 
     #[3200]#[200]#[1600,3200,6400,12800,25600]
@@ -741,10 +746,13 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     #master_addr = os.environ["SLURM_NODELIST"].split(",")[0]
-    master_addr = os.environ.get("MASTER_ADDR", "localhost")
-    master_port = "12355"
-    world_size = 1#int(os.environ["SLURM_NTASKS"])
+    #master_addr = os.environ.get("MASTER_ADDR", "localhost")
+    #master_port = "12355"
+    master_addr = os.environ["MASTER_ADDR"]
+    master_port = os.environ["MASTER_PORT"]
     local_world_size = torch.cuda.device_count()
+    total_nodes = int(os.environ["SLURM_NNODES"])
+    world_size = local_world_size * total_nodes #6#int(os.environ["SLURM_NTASKS"])
 
     ############################ training ################################
     #world_size = torch.cuda.device_count()
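
The launch pattern this diff converges on is: each node runs one copy of the script via srun, mp.spawn starts one worker per local GPU, and the global rank is the local spawn index offset by SLURM_NODEID times the GPUs per node. The sketch below shows that pattern in a minimal, self-contained form, assuming the environment variables exported by the batch script; demo_worker and its structure are illustrative, not code from the repository.

# A minimal sketch of the multi-node launch this commit adopts. Assumes
# one `python` process per node (started by srun) with MASTER_ADDR,
# MASTER_PORT, SLURM_NODEID, and SLURM_NNODES set by the batch script.
# demo_worker is an illustrative name, not code from the repository.
import os
from datetime import timedelta

import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def demo_worker(local_rank: int, local_world_size: int, world_size: int):
    # Offset the node-local spawn index into a unique global rank.
    global_rank = local_rank + local_world_size * int(os.environ["SLURM_NODEID"])
    dist.init_process_group(
        backend="nccl",
        init_method=f"tcp://{os.environ['MASTER_ADDR']}:{os.environ['MASTER_PORT']}",
        rank=global_rank,
        world_size=world_size,
        timeout=timedelta(minutes=4),  # give up if some node never joins
    )
    # The CUDA device is chosen by the *local* index, not the global rank.
    torch.cuda.set_device(local_rank)
    dist.destroy_process_group()

if __name__ == "__main__":
    local_world_size = torch.cuda.device_count()
    world_size = local_world_size * int(os.environ["SLURM_NNODES"])
    mp.spawn(demo_worker, args=(local_world_size, world_size), nprocs=local_world_size)

Note that set_device must take the local index: each node only sees its own GPUs, so indexing by the global rank would fail on every node after the first. This is why the commit keeps torch.cuda.set_device(rank) on the spawn index while passing global_rank to the process group.
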
phoenix_diffusion.sbatch CHANGED
@@ -22,4 +22,10 @@ conda activate diffusers
 conda env list
 cat $0
 
+MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+MASTER_PORT=$((10000 + RANDOM % 10000)) #12355
+
+export MASTER_ADDR=$MASTER_ADDR
+export MASTER_PORT=$MASTER_PORT
+
 srun python diffusion.py --train 1 --sample 0
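
The batch script now derives the rendezvous endpoint from SLURM itself: the first hostname in the job's node list becomes MASTER_ADDR, and a randomized port in 10000–19999 reduces collisions when several jobs share a node. A quick, hypothetical way to confirm that every rank actually met at that endpoint is an all_reduce right after initialization, sketched below; check_rendezvous is an illustrative helper, not part of the repository.

# Hypothetical sanity check, called right after init_process_group
# succeeds (e.g. at the top of train()): the all_reduce only completes
# if all world_size ranks reached the same MASTER_ADDR:MASTER_PORT.
import torch
import torch.distributed as dist

def check_rendezvous():
    t = torch.ones(1, device="cuda")  # uses the device set by set_device()
    dist.all_reduce(t)  # default op is SUM: every rank contributes 1
    if dist.get_rank() == 0:
        assert int(t.item()) == dist.get_world_size()
        print(f"rendezvous OK: {dist.get_world_size()} ranks joined")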