0803-182144
Browse files
- diffusion.py +8 -8
- phoenix_diffusion.sbatch +6 -7
- quantify_results.ipynb +0 -0
diffusion.py
CHANGED
|
@@ -260,7 +260,7 @@ class TrainConfig:
|
|
| 260 |
stride = (2,4) if dim == 2 else (2,2,2)
|
| 261 |
num_image = 3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
|
| 262 |
batch_size = 10#50#10#50#20#50#1#2#50#20#2#100 # 10
|
| 263 |
-
n_epoch = 50#5#50#5#50#100#50#100#30#120#5#4# 10#50#20#20#2#5#25 # 120
|
| 264 |
HII_DIM = 64
|
| 265 |
num_redshift = 512#64#512#64#512#64#256CUDAoom#128#64#512#128#64#512#256#256#64#512#128
|
| 266 |
channel = 1
|
|
@@ -711,7 +711,7 @@ def generate_samples(rank, world_size, local_world_size, master_addr, master_por
|
|
| 711 |
if __name__ == "__main__":
|
| 712 |
parser = argparse.ArgumentParser()
|
| 713 |
parser.add_argument("--train", type=int, required=False, help="whether to train the model", default=1)
|
| 714 |
-
parser.add_argument("--sample", type=int, required=False, help="whether to sample", default=0)
|
| 715 |
parser.add_argument("--resume", type=str, required=False, help="filename of the model to resume", default=False)
|
| 716 |
parser.add_argument("--num_new_img_per_gpu", type=int, required=False, default=4)
|
| 717 |
parser.add_argument("--max_num_img_per_gpu", type=int, required=False, default=2)
|
|
@@ -725,7 +725,7 @@ if __name__ == "__main__":
|
|
| 725 |
world_size = local_world_size * total_nodes #6#int(os.environ["SLURM_NTASKS"])
|
| 726 |
|
| 727 |
############################ training ################################
|
| 728 |
-
if args.train:
|
| 729 |
print(f" training, ip_addr = {socket.gethostbyname(socket.gethostname())}, master_addr = {master_addr}, local_world_size = {local_world_size}, world_size = {world_size} ".center(120,'-'))
|
| 730 |
mp.spawn(
|
| 731 |
train,
|
|
@@ -734,7 +734,7 @@ if __name__ == "__main__":
|
|
| 734 |
join=True,
|
| 735 |
)
|
| 736 |
############################ sampling ################################
|
| 737 |
-
if args.sample:
|
| 738 |
num_new_img_per_gpu = args.num_new_img_per_gpu#200#4#200
|
| 739 |
max_num_img_per_gpu = args.max_num_img_per_gpu#40#2#20
|
| 740 |
config = TrainConfig()
|
|
@@ -748,10 +748,10 @@ if __name__ == "__main__":
|
|
| 748 |
# return_dict = manager.dict()
|
| 749 |
params_pairs = [
|
| 750 |
(4.4, 131.341),
|
| 751 |
-
|
| 752 |
-
|
| 753 |
-
|
| 754 |
-
|
| 755 |
]
|
| 756 |
|
| 757 |
for params in params_pairs:
|
|
|
|
| 260 |
stride = (2,4) if dim == 2 else (2,2,2)
|
| 261 |
num_image = 3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
|
| 262 |
batch_size = 10#50#10#50#20#50#1#2#50#20#2#100 # 10
|
| 263 |
+
n_epoch = 1#50#5#50#5#50#100#50#100#30#120#5#4# 10#50#20#20#2#5#25 # 120
|
| 264 |
HII_DIM = 64
|
| 265 |
num_redshift = 512#64#512#64#512#64#256CUDAoom#128#64#512#128#64#512#256#256#64#512#128
|
| 266 |
channel = 1
|
|
|
|
| 711 |
if __name__ == "__main__":
|
| 712 |
parser = argparse.ArgumentParser()
|
| 713 |
parser.add_argument("--train", type=int, required=False, help="whether to train the model", default=1)
|
| 714 |
+
#parser.add_argument("--sample", type=int, required=False, help="whether to sample", default=0)
|
| 715 |
parser.add_argument("--resume", type=str, required=False, help="filename of the model to resume", default=False)
|
| 716 |
parser.add_argument("--num_new_img_per_gpu", type=int, required=False, default=4)
|
| 717 |
parser.add_argument("--max_num_img_per_gpu", type=int, required=False, default=2)
|
|
|
|
| 725 |
world_size = local_world_size * total_nodes #6#int(os.environ["SLURM_NTASKS"])
|
| 726 |
|
| 727 |
############################ training ################################
|
| 728 |
+
if args.train == 1:
|
| 729 |
print(f" training, ip_addr = {socket.gethostbyname(socket.gethostname())}, master_addr = {master_addr}, local_world_size = {local_world_size}, world_size = {world_size} ".center(120,'-'))
|
| 730 |
mp.spawn(
|
| 731 |
train,
|
|
|
|
| 734 |
join=True,
|
| 735 |
)
|
| 736 |
############################ sampling ################################
|
| 737 |
+
if args.train == 0:
|
| 738 |
num_new_img_per_gpu = args.num_new_img_per_gpu#200#4#200
|
| 739 |
max_num_img_per_gpu = args.max_num_img_per_gpu#40#2#20
|
| 740 |
config = TrainConfig()
|
|
|
|
| 748 |
# return_dict = manager.dict()
|
| 749 |
params_pairs = [
|
| 750 |
(4.4, 131.341),
|
| 751 |
+
(5.6, 19.037),
|
| 752 |
+
(4.699, 30),
|
| 753 |
+
(5.477, 200),
|
| 754 |
+
(4.8, 131.341),
|
| 755 |
]
|
| 756 |
|
| 757 |
for params in params_pairs:
|
phoenix_diffusion.sbatch
CHANGED
|
@@ -2,10 +2,10 @@
|
|
| 2 |
#SBATCH -J diffusion # Job name
|
| 3 |
#SBATCH -A gts-jw254-coda20
|
| 4 |
#SBATCH -qembers
|
| 5 |
-
#SBATCH -
|
| 6 |
#SBATCH --ntasks-per-node=1
|
| 7 |
-
#SBATCH --mem-per-gpu=
|
| 8 |
-
#SBATCH -t
|
| 9 |
#SBATCH -oReport-%j # Combined output and error messages file
|
| 10 |
#SBATCH --error=error-%j
|
| 11 |
#SBATCH --mail-type=BEGIN,END,FAIL # Mail preferences
|
|
@@ -30,10 +30,9 @@ export MASTER_PORT=$MASTER_PORT
|
|
| 30 |
|
| 31 |
srun python diffusion.py \
|
| 32 |
--train 1 \
|
| 33 |
-
--
|
| 34 |
-
--
|
| 35 |
-
--
|
| 36 |
-
--max_num_img_per_gpu 40 \
|
| 37 |
|
| 38 |
######################################################################################
|
| 39 |
|
|
|
|
| 2 |
#SBATCH -J diffusion # Job name
|
| 3 |
#SBATCH -A gts-jw254-coda20
|
| 4 |
#SBATCH -qembers
|
| 5 |
+
#SBATCH -N8 --gpus-per-node=RTX_6000:1 # -C A100-80GB # Number of nodes and cores per node required
|
| 6 |
#SBATCH --ntasks-per-node=1
|
| 7 |
+
#SBATCH --mem-per-gpu=8G # Memory per core
|
| 8 |
+
#SBATCH -t 00:05:00 # Duration of the job (Ex: 15 mins)
|
| 9 |
#SBATCH -oReport-%j # Combined output and error messages file
|
| 10 |
#SBATCH --error=error-%j
|
| 11 |
#SBATCH --mail-type=BEGIN,END,FAIL # Mail preferences
|
|
|
|
| 30 |
|
| 31 |
srun python diffusion.py \
|
| 32 |
--train 1 \
|
| 33 |
+
--resume outputs/model_state-N3000-device_count1-node4-epoch49-172.27.149.191 \
|
| 34 |
+
--num_new_img_per_gpu 50 \
|
| 35 |
+
--max_num_img_per_gpu 10 \
|
|
|
|
| 36 |
|
| 37 |
######################################################################################
|
| 38 |
|
quantify_results.ipynb
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|