0726-1631
Browse files- diffusion.py +3 -2
- phoenix_diffusion.sbatch +1 -0
diffusion.py
CHANGED
|
@@ -732,7 +732,8 @@ if __name__ == "__main__":
|
|
| 732 |
parser.add_argument("--sample", type=int, required=False, help="whether to sample", default=0)
|
| 733 |
args = parser.parse_args()
|
| 734 |
|
| 735 |
- master_addr = os.environ["SLURM_NODELIST"].split(",")[0]
|
|
|
|
| 736 |
master_port = "12355"
|
| 737 |
world_size = int(os.environ["SLURM_NTASKS"])
|
| 738 |
local_world_size = torch.cuda.device_count()
|
|
@@ -740,7 +741,7 @@ if __name__ == "__main__":
|
|
| 740 |
############################ training ################################
|
| 741 |
#world_size = torch.cuda.device_count()
|
| 742 |
if args.train:
|
| 743 |
- print(f" training, ip_addr = {socket.gethostbyname(socket.gethostname())}, local_world_size = {local_world_size}, world_size = {world_size} ".center(120,'-'))
|
| 744 |
mp.spawn(
|
| 745 |
train,
|
| 746 |
args=(world_size, local_world_size, master_addr, master_port),
|
|
|
|
| 732 |
parser.add_argument("--sample", type=int, required=False, help="whether to sample", default=0)
|
| 733 |
args = parser.parse_args()
|
| 734 |
|
| 735 |
+ #master_addr = os.environ["SLURM_NODELIST"].split(",")[0]
|
| 736 |
+ master_addr = os.environ.get("MASTER_ADDR", "localhost")
|
| 737 |
master_port = "12355"
|
| 738 |
world_size = int(os.environ["SLURM_NTASKS"])
|
| 739 |
local_world_size = torch.cuda.device_count()
|
|
|
|
| 741 |
############################ training ################################
|
| 742 |
#world_size = torch.cuda.device_count()
|
| 743 |
if args.train:
|
| 744 |
+ print(f" training, ip_addr = {socket.gethostbyname(socket.gethostname())}, master_addr = {master_addr}, local_world_size = {local_world_size}, world_size = {world_size} ".center(120,'-'))
|
| 745 |
mp.spawn(
|
| 746 |
train,
|
| 747 |
args=(world_size, local_world_size, master_addr, master_port),
|
phoenix_diffusion.sbatch
CHANGED
|
@@ -3,6 +3,7 @@
|
|
| 3 |
#SBATCH -A gts-jw254-coda20
|
| 4 |
#SBATCH -qembers
|
| 5 |
#SBATCH -N2 --gpus-per-node=RTX_6000:3 # -C A100-80GB # Number of nodes and cores per node required
|
|
|
|
| 6 |
#SBATCH --mem-per-gpu=32G # Memory per core
|
| 7 |
#SBATCH -t 10:00 # Duration of the job (Ex: 15 mins)
|
| 8 |
#SBATCH -oReport-%j # Combined output and error messages file
|
|
|
|
| 3 |
#SBATCH -A gts-jw254-coda20
|
| 4 |
#SBATCH -qembers
|
| 5 |
#SBATCH -N2 --gpus-per-node=RTX_6000:3 # -C A100-80GB # Number of nodes and cores per node required
|
| 6 |
+ #SBATCH --ntasks-per-node=1
|
| 7 |
#SBATCH --mem-per-gpu=32G # Memory per core
|
| 8 |
#SBATCH -t 10:00 # Duration of the job (Ex: 15 mins)
|
| 9 |
#SBATCH -oReport-%j # Combined output and error messages file
|