Xsmos committed on
Commit
a0f5c0a
·
verified ·
1 Parent(s): 3ef3f4a

0726-1631

Browse files
Files changed (2) hide show
  1. diffusion.py +3 -2
  2. phoenix_diffusion.sbatch +1 -0
diffusion.py CHANGED
@@ -732,7 +732,8 @@ if __name__ == "__main__":
732
  parser.add_argument("--sample", type=int, required=False, help="whether to sample", default=0)
733
  args = parser.parse_args()
734
 
735
- master_addr = os.environ["SLURM_NODELIST"].split(",")[0]
 
736
  master_port = "12355"
737
  world_size = int(os.environ["SLURM_NTASKS"])
738
  local_world_size = torch.cuda.device_count()
@@ -740,7 +741,7 @@ if __name__ == "__main__":
740
  ############################ training ################################
741
  #world_size = torch.cuda.device_count()
742
  if args.train:
743
- print(f" training, ip_addr = {socket.gethostbyname(socket.gethostname())}, local_world_size = {local_world_size}, world_size = {world_size} ".center(120,'-'))
744
  mp.spawn(
745
  train,
746
  args=(world_size, local_world_size, master_addr, master_port),
 
732
  parser.add_argument("--sample", type=int, required=False, help="whether to sample", default=0)
733
  args = parser.parse_args()
734
 
735
+ #master_addr = os.environ["SLURM_NODELIST"].split(",")[0]
736
+ master_addr = os.environ.get("MASTER_ADDR", "localhost")
737
  master_port = "12355"
738
  world_size = int(os.environ["SLURM_NTASKS"])
739
  local_world_size = torch.cuda.device_count()
 
741
  ############################ training ################################
742
  #world_size = torch.cuda.device_count()
743
  if args.train:
744
+ print(f" training, ip_addr = {socket.gethostbyname(socket.gethostname())}, master_addr = {master_addr}, local_world_size = {local_world_size}, world_size = {world_size} ".center(120,'-'))
745
  mp.spawn(
746
  train,
747
  args=(world_size, local_world_size, master_addr, master_port),
phoenix_diffusion.sbatch CHANGED
@@ -3,6 +3,7 @@
3
  #SBATCH -A gts-jw254-coda20
4
  #SBATCH -qembers
5
  #SBATCH -N2 --gpus-per-node=RTX_6000:3 # -C A100-80GB # Number of nodes and cores per node required
 
6
  #SBATCH --mem-per-gpu=32G # Memory per core
7
  #SBATCH -t 10:00 # Duration of the job (Ex: 15 mins)
8
  #SBATCH -oReport-%j # Combined output and error messages file
 
3
  #SBATCH -A gts-jw254-coda20
4
  #SBATCH -qembers
5
  #SBATCH -N2 --gpus-per-node=RTX_6000:3 # -C A100-80GB # Number of nodes and cores per node required
6
+ #SBATCH --ntasks-per-node=1
7
  #SBATCH --mem-per-gpu=32G # Memory per core
8
  #SBATCH -t 10:00 # Duration of the job (Ex: 15 mins)
9
  #SBATCH -oReport-%j # Combined output and error messages file