Commit 0728-1322 — Browse files

Changed files:
- diffusion.py              (+17, −10)
- phoenix_diffusion.sbatch  (+1, −1)

diffusion.py — CHANGED
@@ -75,15 +75,22 @@ import socket

Old side (lines 75–89):

  75   import sys
  76   # %%
  77   def ddp_setup(rank: int, world_size: int, local_world_size, master_addr, master_port):
  78 - (removed)
  79 - (removed)
  80 - (removed)
  81 - (removed)
  82 - (removed)
  83 - (removed)
  84 - (removed)
  85 - (removed)
  86 - (removed)
       [NOTE: the content of the nine removed lines 78–86 — presumably the old ddp_setup body — was not captured by this page extraction.]
  87
  88   # %%
  89   # notebook_login()
@@ -736,7 +743,7 @@ if __name__ == "__main__":

Old side (lines 736–742):

 736   #master_addr = os.environ["SLURM_NODELIST"].split(",")[0]
 737   master_addr = os.environ.get("MASTER_ADDR", "localhost")
 738   master_port = "12355"
 739 - world_size = int(os.environ["SLURM_NTASKS"])
 740   local_world_size = torch.cuda.device_count()
 741
 742   ############################ training ################################
New side of hunk @@ -75,15 +75,22 @@ (lines 75–96):

  75   import sys
  76   # %%
  77   def ddp_setup(rank: int, world_size: int, local_world_size, master_addr, master_port):
  78 +     """
  79 +     Args:
  80 +         rank: Unique identifier of each process
  81 +         world_size: Total number of processes
  82 +     """
  83 +
  84 +     print("inside ddp_setup")
  85 +     os.environ["MASTER_ADDR"] = master_addr
  86 +     os.environ["MASTER_PORT"] = master_port
  87 +     print("ddp_setup, rank =", rank)
  88 +     init_process_group(
  89 +         backend="nccl",
  90 +         init_method=f"tcp://{master_addr}:{master_port}",
  91 +         rank=rank,
  92 +         world_size=world_size*local_world_size
  93 +     )
  94
  95   # %%
  96   # notebook_login()
New side of hunk @@ -736,7 +743,7 @@ (lines 743–749):

 743   #master_addr = os.environ["SLURM_NODELIST"].split(",")[0]
 744   master_addr = os.environ.get("MASTER_ADDR", "localhost")
 745   master_port = "12355"
 746 + world_size = 1#int(os.environ["SLURM_NTASKS"])
 747   local_world_size = torch.cuda.device_count()
 748
 749   ############################ training ################################
phoenix_diffusion.sbatch — CHANGED

@@ -2,7 +2,7 @@

Old side (lines 2–8):

   2   #SBATCH -J diffusion                    # Job name
   3   #SBATCH -A gts-jw254-coda20
   4   #SBATCH -qembers
   5 - #SBATCH -
       [NOTE: the removed line 5 was truncated by this page extraction; only "#SBATCH -" was captured.]
   6   #SBATCH --ntasks-per-node=1
   7   #SBATCH --mem-per-gpu=32G               # Memory per core
   8   #SBATCH -t 10:00                        # Duration of the job (Ex: 15 mins)
New side of hunk @@ -2,7 +2,7 @@ (lines 2–8):

   2   #SBATCH -J diffusion                    # Job name
   3   #SBATCH -A gts-jw254-coda20
   4   #SBATCH -qembers
   5 + #SBATCH -N2 --gpus-per-node=RTX_6000:3 # -C A100-80GB # Number of nodes and cores per node required
   6   #SBATCH --ntasks-per-node=1
   7   #SBATCH --mem-per-gpu=32G               # Memory per core
   8   #SBATCH -t 10:00                        # Duration of the job (Ex: 15 mins)