Xsmos committed on
Commit
e903b59
·
verified ·
1 Parent(s): 9cc45de
Files changed (2) hide show
  1. perlmutter_diffusion.sbatch +8 -8
  2. tensorboard.ipynb +4 -4
perlmutter_diffusion.sbatch CHANGED
@@ -2,10 +2,10 @@
2
  #SBATCH -A m4717
3
  #SBATCH -J diffusion
4
  #SBATCH -C gpu&hbm80g
5
- #SBATCH -q regular #shared
6
- #SBATCH -N5
7
- #SBATCH --gpus-per-node=4
8
- #SBATCH -t 48:00:00
9
  #SBATCH --ntasks-per-node=1
10
  #SBATCH -oReport-%j
11
  #SBATCH --mail-type=BEGIN,END,FAIL
@@ -40,12 +40,12 @@ srun python diffusion.py \
40
  --batch_size 2 \
41
  --n_epoch 80 \
42
  --channel_mult 0.5 1 2 4 4 8 \
43
- --num_new_img_per_gpu 800 \
44
- --max_num_img_per_gpu 100 \
45
  --gradient_accumulation_steps 1 \
46
  --autocast 1 \
47
  --use_checkpoint 1 \
48
- --train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
49
- #--resume ./outputs/model-N6400-device_count4-node1-epoch199-05185634 \
50
 
51
  date
 
2
  #SBATCH -A m4717
3
  #SBATCH -J diffusion
4
  #SBATCH -C gpu&hbm80g
5
+ #SBATCH -q shared #regular
6
+ #SBATCH -N1
7
+ #SBATCH --gpus-per-node=1
8
+ #SBATCH -t 6:00:00
9
  #SBATCH --ntasks-per-node=1
10
  #SBATCH -oReport-%j
11
  #SBATCH --mail-type=BEGIN,END,FAIL
 
40
  --batch_size 2 \
41
  --n_epoch 80 \
42
  --channel_mult 0.5 1 2 4 4 8 \
43
+ --num_new_img_per_gpu 9 \
44
+ --max_num_img_per_gpu 3 \
45
  --gradient_accumulation_steps 1 \
46
  --autocast 1 \
47
  --use_checkpoint 1 \
48
+ --resume ./outputs/model-N1280-device_count4-node5-epoch34-13133235 \
49
+ #--train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
50
 
51
  date
tensorboard.ipynb CHANGED
@@ -23,13 +23,13 @@
23
  "data": {
24
  "text/html": [
25
  "\n",
26
- " <iframe id=\"tensorboard-frame-8bbb5cb424abc4b5\" width=\"100%\" height=\"800\" frameborder=\"0\">\n",
27
  " </iframe>\n",
28
  " <script>\n",
29
  " (function() {\n",
30
- " const frame = document.getElementById(\"tensorboard-frame-8bbb5cb424abc4b5\");\n",
31
  " const url = new URL(\"/\", window.location);\n",
32
- " const port = 34693;\n",
33
  " if (port) {\n",
34
  " url.port = port;\n",
35
  " }\n",
@@ -59,7 +59,7 @@
59
  {
60
  "data": {
61
  "text/html": [
62
- "<a href=\"https://jupyter.nersc.gov/user/binxia/perlmutter-login-node-base/proxy/34693/\">https://jupyter.nersc.gov/user/binxia/perlmutter-login-node-base/proxy/34693/</a>"
63
  ],
64
  "text/plain": [
65
  "<IPython.core.display.HTML object>"
 
23
  "data": {
24
  "text/html": [
25
  "\n",
26
+ " <iframe id=\"tensorboard-frame-c11f24d3c7445b04\" width=\"100%\" height=\"800\" frameborder=\"0\">\n",
27
  " </iframe>\n",
28
  " <script>\n",
29
  " (function() {\n",
30
+ " const frame = document.getElementById(\"tensorboard-frame-c11f24d3c7445b04\");\n",
31
  " const url = new URL(\"/\", window.location);\n",
32
+ " const port = 46861;\n",
33
  " if (port) {\n",
34
  " url.port = port;\n",
35
  " }\n",
 
59
  {
60
  "data": {
61
  "text/html": [
62
+ "<a href=\"https://jupyter.nersc.gov/user/binxia/perlmutter-login-node-base/proxy/46861/\">https://jupyter.nersc.gov/user/binxia/perlmutter-login-node-base/proxy/46861/</a>"
63
  ],
64
  "text/plain": [
65
  "<IPython.core.display.HTML object>"