Xsmos committed on
Commit
876f008
·
verified ·
1 Parent(s): 25f1942
Files changed (2) hide show
  1. diffusion.py +17 -10
  2. phoenix_diffusion.sbatch +1 -1
diffusion.py CHANGED
@@ -75,15 +75,22 @@ import socket
75
  import sys
76
  # %%
77
  def ddp_setup(rank: int, world_size: int, local_world_size, master_addr, master_port):
78
- """
79
- Args:
80
- rank: Unique identifier of each process
81
- world_size: Total number of processes
82
- """
83
- os.environ["MASTER_ADDR"] = master_addr
84
- os.environ["MASTER_PORT"] = master_port
85
- # print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!ddp_setup, rank =", rank)
86
- init_process_group(backend="nccl", rank=rank, world_size=world_size*local_world_size)
 
 
 
 
 
 
 
87
 
88
  # %%
89
  # notebook_login()
@@ -736,7 +743,7 @@ if __name__ == "__main__":
736
  #master_addr = os.environ["SLURM_NODELIST"].split(",")[0]
737
  master_addr = os.environ.get("MASTER_ADDR", "localhost")
738
  master_port = "12355"
739
- world_size = int(os.environ["SLURM_NTASKS"])
740
  local_world_size = torch.cuda.device_count()
741
 
742
  ############################ training ################################
 
75
  import sys
76
  # %%
77
  def ddp_setup(rank: int, world_size: int, local_world_size, master_addr, master_port):
78
+ """
79
+ Args:
80
+ rank: Unique identifier of each process
81
+ world_size: Total number of processes
82
+ """
83
+
84
+ print("inside ddp_setup")
85
+ os.environ["MASTER_ADDR"] = master_addr
86
+ os.environ["MASTER_PORT"] = master_port
87
+ print("ddp_setup, rank =", rank)
88
+ init_process_group(
89
+ backend="nccl",
90
+ init_method=f"tcp://{master_addr}:{master_port}",
91
+ rank=rank,
92
+ world_size=world_size*local_world_size
93
+ )
94
 
95
  # %%
96
  # notebook_login()
 
743
  #master_addr = os.environ["SLURM_NODELIST"].split(",")[0]
744
  master_addr = os.environ.get("MASTER_ADDR", "localhost")
745
  master_port = "12355"
746
+ world_size = 1#int(os.environ["SLURM_NTASKS"])
747
  local_world_size = torch.cuda.device_count()
748
 
749
  ############################ training ################################
phoenix_diffusion.sbatch CHANGED
@@ -2,7 +2,7 @@
2
  #SBATCH -J diffusion # Job name
3
  #SBATCH -A gts-jw254-coda20
4
  #SBATCH -qembers
5
- #SBATCH -N1 --gpus-per-node=RTX_6000:3 # -C A100-80GB # Number of nodes and cores per node required
6
  #SBATCH --ntasks-per-node=1
7
  #SBATCH --mem-per-gpu=32G # Memory per core
8
  #SBATCH -t 10:00 # Duration of the job (Ex: 15 mins)
 
2
  #SBATCH -J diffusion # Job name
3
  #SBATCH -A gts-jw254-coda20
4
  #SBATCH -qembers
5
+ #SBATCH -N2 --gpus-per-node=RTX_6000:3 # -C A100-80GB # Number of nodes and cores per node required
6
  #SBATCH --ntasks-per-node=1
7
  #SBATCH --mem-per-gpu=32G # Memory per core
8
  #SBATCH -t 10:00 # Duration of the job (Ex: 15 mins)