Xsmos committed on
Commit
25f1942
·
verified ·
1 Parent(s): 20b799b

0728-1034

Browse files
Files changed (2) hide show
  1. diffusion.py +3 -3
  2. phoenix_diffusion.sbatch +1 -1
diffusion.py CHANGED
@@ -651,12 +651,12 @@ class DDPM21CM:
651
 
652
  #num_train_image_list = [6000]#[60]#[8000]#[1000]#[100]#
653
  def train(rank, world_size, local_world_size, master_addr, master_port):
654
- #print("before ddp_setup")
655
  ddp_setup(rank, world_size, local_world_size, master_addr, master_port)
656
- #print("after ddp_setup")
657
  local_rank = rank % local_world_size
658
  torch.cuda.set_device(local_rank)
659
- #print("after set device")
660
  print(f"rank = {rank}, local_rank = {local_rank}, world_size = {world_size}, local_world_size = {local_world_size}")
661
 
662
  config = TrainConfig()
 
651
 
652
  #num_train_image_list = [6000]#[60]#[8000]#[1000]#[100]#
653
  def train(rank, world_size, local_world_size, master_addr, master_port):
654
+ print("before ddp_setup")
655
  ddp_setup(rank, world_size, local_world_size, master_addr, master_port)
656
+ print("after ddp_setup")
657
  local_rank = rank % local_world_size
658
  torch.cuda.set_device(local_rank)
659
+ print("after set device")
660
  print(f"rank = {rank}, local_rank = {local_rank}, world_size = {world_size}, local_world_size = {local_world_size}")
661
 
662
  config = TrainConfig()
phoenix_diffusion.sbatch CHANGED
@@ -2,7 +2,7 @@
2
  #SBATCH -J diffusion # Job name
3
  #SBATCH -A gts-jw254-coda20
4
  #SBATCH -qembers
5
- #SBATCH -N1 --gpus-per-node=RTX_6000:4 # -C A100-80GB # Number of nodes and cores per node required
6
  #SBATCH --ntasks-per-node=1
7
  #SBATCH --mem-per-gpu=32G # Memory per core
8
  #SBATCH -t 10:00 # Duration of the job (Ex: 15 mins)
 
2
  #SBATCH -J diffusion # Job name
3
  #SBATCH -A gts-jw254-coda20
4
  #SBATCH -qembers
5
+ #SBATCH -N1 --gpus-per-node=RTX_6000:3 # -C A100-80GB # Number of nodes and cores per node required
6
  #SBATCH --ntasks-per-node=1
7
  #SBATCH --mem-per-gpu=32G # Memory per core
8
  #SBATCH -t 10:00 # Duration of the job (Ex: 15 mins)