0816-110239
Browse files
- diffusion.py +4 -4
- phoenix_diffusion.sbatch +3 -3
diffusion.py
CHANGED
|
@@ -269,9 +269,9 @@ class TrainConfig:
|
|
| 269 |
# dim = 2
|
| 270 |
dim = 3#2
|
| 271 |
stride = (2,4) if dim == 2 else (2,2,2)
|
| 272 |
-
num_image =
|
| 273 |
-
batch_size = 1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
|
| 274 |
-
n_epoch =
|
| 275 |
HII_DIM = 64
|
| 276 |
num_redshift = 64#256#512#256#512#256#512#256#512#64#512#64#512#64#256CUDAoom#128#64#512#128#64#512#256#256#64#512#128
|
| 277 |
channel = 1
|
|
@@ -743,7 +743,7 @@ if __name__ == "__main__":
|
|
| 743 |
parser.add_argument("--resume", type=str, required=False, help="filename of the model to resume", default=False)
|
| 744 |
parser.add_argument("--num_new_img_per_gpu", type=int, required=False, default=4)
|
| 745 |
parser.add_argument("--max_num_img_per_gpu", type=int, required=False, default=2)
|
| 746 |
-
parser.add_argument("--gradient_accumulation_steps", type=int, required=False, default=1)
|
| 747 |
|
| 748 |
args = parser.parse_args()
|
| 749 |
|
|
|
|
| 269 |
# dim = 2
|
| 270 |
dim = 3#2
|
| 271 |
stride = (2,4) if dim == 2 else (2,2,2)
|
| 272 |
+
num_image = 3000#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
|
| 273 |
+
batch_size = 2#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
|
| 274 |
+
n_epoch = 40#1#50#10#1#50#1#50#5#50#5#50#100#50#100#30#120#5#4# 10#50#20#20#2#5#25 # 120
|
| 275 |
HII_DIM = 64
|
| 276 |
num_redshift = 64#256#512#256#512#256#512#256#512#64#512#64#512#64#256CUDAoom#128#64#512#128#64#512#256#256#64#512#128
|
| 277 |
channel = 1
|
|
|
|
| 743 |
parser.add_argument("--resume", type=str, required=False, help="filename of the model to resume", default=False)
|
| 744 |
parser.add_argument("--num_new_img_per_gpu", type=int, required=False, default=4)
|
| 745 |
parser.add_argument("--max_num_img_per_gpu", type=int, required=False, default=2)
|
| 746 |
+
parser.add_argument("--gradient_accumulation_steps", type=int, required=False, default=1) # in testing, higher values led to slower training and a higher final loss
|
| 747 |
|
| 748 |
args = parser.parse_args()
|
| 749 |
|
phoenix_diffusion.sbatch
CHANGED
|
@@ -2,10 +2,10 @@
|
|
| 2 |
#SBATCH -J diffusion # Job name
|
| 3 |
#SBATCH -A gts-jw254-coda20
|
| 4 |
#SBATCH -qembers
|
| 5 |
-
#SBATCH -
|
| 6 |
#SBATCH --ntasks-per-node=1
|
| 7 |
#SBATCH --mem-per-gpu=16G # Memory per GPU
|
| 8 |
-
#SBATCH -t
|
| 9 |
#SBATCH -oReport-%j # Standard output file (stderr goes to the separate --error file)
|
| 10 |
#SBATCH --error=error-%j
|
| 11 |
#SBATCH --mail-type=BEGIN,END,FAIL # Mail preferences
|
|
@@ -34,6 +34,6 @@ srun python diffusion.py \
|
|
| 34 |
--resume outputs/model_state-N3000-device_count1-node2-epoch49-172.27.149.67 \
|
| 35 |
--num_new_img_per_gpu 50 \
|
| 36 |
--max_num_img_per_gpu 5 \
|
| 37 |
-
--gradient_accumulation_steps
|
| 38 |
######################################################################################
|
| 39 |
|
|
|
|
| 2 |
#SBATCH -J diffusion # Job name
|
| 3 |
#SBATCH -A gts-jw254-coda20
|
| 4 |
#SBATCH -qembers
|
| 5 |
+
#SBATCH -N4 --gpus-per-node=V100:1 -C V100-16GB # Number of nodes, GPUs per node, and GPU type constraint
|
| 6 |
#SBATCH --ntasks-per-node=1
|
| 7 |
#SBATCH --mem-per-gpu=16G # Memory per GPU
|
| 8 |
+
#SBATCH -t 08:00:00 # Wall-clock time limit for the job (HH:MM:SS)
|
| 9 |
#SBATCH -oReport-%j # Standard output file (stderr goes to the separate --error file)
|
| 10 |
#SBATCH --error=error-%j
|
| 11 |
#SBATCH --mail-type=BEGIN,END,FAIL # Mail preferences
|
|
|
|
| 34 |
--resume outputs/model_state-N3000-device_count1-node2-epoch49-172.27.149.67 \
|
| 35 |
--num_new_img_per_gpu 50 \
|
| 36 |
--max_num_img_per_gpu 5 \
|
| 37 |
+
--gradient_accumulation_steps 1 \
|
| 38 |
######################################################################################
|
| 39 |
|