Xsmos committed on
Commit
53c31d2
·
verified ·
1 Parent(s): 50b223d
Files changed (2) hide show
  1. diffusion.py +2 -2
  2. phoenix_diffusion.sbatch +1 -1
diffusion.py CHANGED
@@ -71,7 +71,7 @@ from torch.distributed import init_process_group, destroy_process_group
71
  import torch.distributed as dist
72
 
73
  import argparse
74
-
75
 
76
  # %%
77
  def ddp_setup(rank: int, world_size: int, master_addr, master_port):
@@ -738,7 +738,7 @@ if __name__ == "__main__":
738
  ############################ training ################################
739
  world_size = torch.cuda.device_count()
740
  if args.train:
741
- print(f" training, master_addr = {master_addr}, local_world_size = {local_world_size}, world_size = {world_size} ".center(120,'-'))
742
  mp.spawn(
743
  train,
744
  args=(world_size, local_world_size, master_addr, master_port),
 
71
  import torch.distributed as dist
72
 
73
  import argparse
74
+ import socket
75
 
76
  # %%
77
  def ddp_setup(rank: int, world_size: int, master_addr, master_port):
 
738
  ############################ training ################################
739
  world_size = torch.cuda.device_count()
740
  if args.train:
741
+ print(f" training, ip_addr = {socket.gethostbyname(socket.gethostname())}, local_world_size = {local_world_size}, world_size = {world_size} ".center(120,'-'))
742
  mp.spawn(
743
  train,
744
  args=(world_size, local_world_size, master_addr, master_port),
phoenix_diffusion.sbatch CHANGED
@@ -2,7 +2,7 @@
2
  #SBATCH -J diffusion # Job name
3
  #SBATCH -A gts-jw254-coda20
4
  #SBATCH -qembers
5
- #SBATCH -N2 --gpus-per-node=RTX_6000:4 # -C A100-80GB # Number of nodes and cores per node required
6
  #SBATCH --mem-per-gpu=32G # Memory per core
7
  #SBATCH -t 10:00 # Duration of the job (Ex: 15 mins)
8
  #SBATCH -oReport-%j # Combined output and error messages file
 
2
  #SBATCH -J diffusion # Job name
3
  #SBATCH -A gts-jw254-coda20
4
  #SBATCH -qembers
5
+ #SBATCH -N4 --gpus-per-node=RTX_6000:2 # -C A100-80GB # Number of nodes and cores per node required
6
  #SBATCH --mem-per-gpu=32G # Memory per core
7
  #SBATCH -t 10:00 # Duration of the job (Ex: 15 mins)
8
  #SBATCH -oReport-%j # Combined output and error messages file