0728-1034
- diffusion.py +3 -3
- phoenix_diffusion.sbatch +1 -1
diffusion.py
@@ -651,12 +651,12 @@ class DDPM21CM:
 
 #num_train_image_list = [6000]#[60]#[8000]#[1000]#[100]#
 def train(rank, world_size, local_world_size, master_addr, master_port):
-
+    print("before ddp_setup")
     ddp_setup(rank, world_size, local_world_size, master_addr, master_port)
-
+    print("after ddp_setup")
     local_rank = rank % local_world_size
     torch.cuda.set_device(local_rank)
-
+    print("after set device")
     print(f"rank = {rank}, local_rank = {local_rank}, world_size = {world_size}, local_world_size = {local_world_size}")
 
     config = TrainConfig()
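The three added prints bracket the DDP initialization, a common way to pinpoint where a multi-process launch hangs (a blocking init_process_group is a frequent culprit). For reference, a minimal sketch of what a ddp_setup helper with this signature typically does; the actual body in diffusion.py is not shown in this diff, so treat the details (NCCL backend, env:// rendezvous) as assumptions:

import os
import torch.distributed as dist

def ddp_setup(rank, world_size, local_world_size, master_addr, master_port):
    # Point every process at the same rendezvous endpoint (rank 0's node).
    os.environ["MASTER_ADDR"] = master_addr
    os.environ["MASTER_PORT"] = str(master_port)
    # NCCL is the usual backend for multi-GPU training. This call blocks
    # until all world_size processes have joined, which is exactly the
    # hang the new "before/after ddp_setup" prints help diagnose.
    # (local_world_size is used by the caller to pick the CUDA device.)
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)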
phoenix_diffusion.sbatch
@@ -2,7 +2,7 @@
 #SBATCH -J diffusion # Job name
 #SBATCH -A gts-jw254-coda20
 #SBATCH -qembers
-#SBATCH -N1 --gpus-per-node=RTX_6000:
+#SBATCH -N1 --gpus-per-node=RTX_6000:3 # -C A100-80GB # Number of nodes and cores per node required
 #SBATCH --ntasks-per-node=1
 #SBATCH --mem-per-gpu=32G # Memory per core
 #SBATCH -t 10:00 # Duration of the job (Ex: 15 mins)
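With -N1 --gpus-per-node=RTX_6000:3 and --ntasks-per-node=1, SLURM launches a single task that owns all three GPUs, so train() is presumably fanned out with one process per GPU from inside the job script. A sketch of that launch glue, assuming standard SLURM environment variables and torch.multiprocessing (this launcher is not part of the commit):

import os
import torch
import torch.multiprocessing as mp
from diffusion import train  # the train() shown in the diff above

if __name__ == "__main__":
    local_world_size = torch.cuda.device_count()   # 3 with RTX_6000:3
    n_nodes = int(os.environ.get("SLURM_JOB_NUM_NODES", "1"))
    world_size = n_nodes * local_world_size        # 3 for this -N1 job
    master_addr = os.environ.get("MASTER_ADDR", "localhost")
    master_port = os.environ.get("MASTER_PORT", "29500")
    # mp.spawn passes the process index as the first argument, which
    # train() takes as its global rank (equal to the local rank on -N1).
    mp.spawn(train,
             args=(world_size, local_world_size, master_addr, master_port),
             nprocs=local_world_size)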