20145032

Browse files

Files changed (5) hide show

diffusion.py +9 -6
environment.yml +346 -0
frontera_diffusion.sbatch +33 -0
load_h5.py +4 -3
phoenix_diffusion.sbatch +4 -4

diffusion.py CHANGED Viewed

@@ -30,7 +30,7 @@
 import logging
 #logging.getLogger("torch").setLevel(logging.ERROR)
 import warnings
-#warnings.filterwarnings("ignore", message=r"^Detected kernel version")
 from dataclasses import dataclass
 #import h5py
@@ -269,11 +269,12 @@ class TrainConfig:
     # dim = 2
     dim = 3#2
     stride = (2,4) if dim == 2 else (2,2,2)
-    num_image = 2000#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
     batch_size = 1#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
-    n_epoch = 20#1#50#10#1#50#1#50#5#50#5#50#100#50#100#30#120#5#4# 10#50#20#20#2#5#25 # 120
     HII_DIM = 64
     num_redshift = 64#256#512#256#512#256#512#256#512#64#512#64#512#64#256CUDAoom#128#64#512#128#64#512#256#256#64#512#128
     channel = 1
     img_shape = (channel, HII_DIM, num_redshift) if dim == 2 else (channel, HII_DIM, HII_DIM, num_redshift)
@@ -444,6 +445,7 @@ class DDPM21CM:
             idx = "random",#'range',
             HII_DIM=self.config.HII_DIM,
             num_redshift=self.config.num_redshift,
             drop_prob=self.config.drop_prob,
             dim=self.config.dim,
             ranges_dict=self.ranges_dict,
@@ -740,7 +742,7 @@ def generate_samples(rank, world_size, local_world_size, master_addr, master_por
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--train", type=int, required=False, help="whether to train the model", default=1)
     #parser.add_argument("--sample", type=int, required=False, help="whether to sample", default=0)
     parser.add_argument("--resume", type=str, required=False, help="filename of the model to resume", default=False)
     parser.add_argument("--num_new_img_per_gpu", type=int, required=False, default=4)
@@ -758,7 +760,8 @@ if __name__ == "__main__":
     config = TrainConfig()
     config.gradient_accumulation_steps = args.gradient_accumulation_steps
     ############################ training ################################
-    if args.train == 1:
         print(f" training, ip_addr = {socket.gethostbyname(socket.gethostname())}, master_addr = {master_addr}, local_world_size = {local_world_size}, world_size = {world_size} ".center(120,'-'))
         mp.spawn(
                 train,
@@ -767,7 +770,7 @@ if __name__ == "__main__":
                 join=True,
                 )
     ############################ sampling ################################
-    if args.train == 0:
         num_new_img_per_gpu = args.num_new_img_per_gpu#200#4#200
         max_num_img_per_gpu = args.max_num_img_per_gpu#40#2#20
         #config = TrainConfig()

 import logging
 #logging.getLogger("torch").setLevel(logging.ERROR)
 import warnings
+warnings.filterwarnings("ignore", category=FutureWarning)
 from dataclasses import dataclass
 #import h5py
     # dim = 2
     dim = 3#2
     stride = (2,4) if dim == 2 else (2,2,2)
+    num_image = 3000#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
     batch_size = 1#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
+    n_epoch = 2#0#1#50#10#1#50#1#50#5#50#5#50#100#50#100#30#120#5#4# 10#50#20#20#2#5#25 # 120
     HII_DIM = 64
     num_redshift = 64#256#512#256#512#256#512#256#512#64#512#64#512#64#256CUDAoom#128#64#512#128#64#512#256#256#64#512#128
+    startat = 512-num_redshift
     channel = 1
     img_shape = (channel, HII_DIM, num_redshift) if dim == 2 else (channel, HII_DIM, HII_DIM, num_redshift)
             idx = "random",#'range',
             HII_DIM=self.config.HII_DIM,
             num_redshift=self.config.num_redshift,
+            startat=self.config.startat,
             drop_prob=self.config.drop_prob,
             dim=self.config.dim,
             ranges_dict=self.ranges_dict,
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
+    parser.add_argument("--train", type=str, required=False, help="whether to train the model", default=False)
     #parser.add_argument("--sample", type=int, required=False, help="whether to sample", default=0)
     parser.add_argument("--resume", type=str, required=False, help="filename of the model to resume", default=False)
     parser.add_argument("--num_new_img_per_gpu", type=int, required=False, default=4)
     config = TrainConfig()
     config.gradient_accumulation_steps = args.gradient_accumulation_steps
     ############################ training ################################
+    if args.train:
+        config.dataset_name = args.train
         print(f" training, ip_addr = {socket.gethostbyname(socket.gethostname())}, master_addr = {master_addr}, local_world_size = {local_world_size}, world_size = {world_size} ".center(120,'-'))
         mp.spawn(
                 train,
                 join=True,
                 )
     ############################ sampling ################################
+    if args.resume:
         num_new_img_per_gpu = args.num_new_img_per_gpu#200#4#200
         max_num_img_per_gpu = args.max_num_img_per_gpu#40#2#20
         #config = TrainConfig()

environment.yml ADDED Viewed

	@@ -0,0 +1,346 @@

+name: diffusers
+channels:
+  - anaconda
+  - fastai
+  - conda-forge
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=conda_forge
+  - _openmp_mutex=4.5=2_gnu
+  - abseil-cpp=20211102.0=h27087fc_1
+  - absl-py=2.1.0=pyhd8ed1ab_0
+  - accelerate=0.28.0=pyhd8ed1ab_0
+  - aiohttp=3.9.3=py39hd1e30aa_1
+  - aiosignal=1.3.1=pyhd8ed1ab_0
+  - annotated-types=0.7.0=pyhd8ed1ab_0
+  - anyio=4.4.0=pyhd8ed1ab_0
+  - argon2-cffi=23.1.0=pyhd8ed1ab_0
+  - argon2-cffi-bindings=21.2.0=py39hd1e30aa_4
+  - arrow=1.3.0=pyhd8ed1ab_0
+  - arrow-cpp=14.0.2=h374c478_1
+  - asttokens=2.4.1=pyhd8ed1ab_0
+  - async-lru=2.0.4=pyhd8ed1ab_0
+  - async-timeout=4.0.3=pyhd8ed1ab_0
+  - attrs=23.2.0=pyh71513ae_0
+  - aws-c-auth=0.6.19=h5eee18b_0
+  - aws-c-cal=0.5.20=hdbd6064_0
+  - aws-c-common=0.8.5=h5eee18b_0
+  - aws-c-compression=0.2.16=h5eee18b_0
+  - aws-c-event-stream=0.2.15=h6a678d5_0
+  - aws-c-http=0.6.25=h5eee18b_0
+  - aws-c-io=0.13.10=h5eee18b_0
+  - aws-c-mqtt=0.7.13=h5eee18b_0
+  - aws-c-s3=0.1.51=hdbd6064_0
+  - aws-c-sdkutils=0.1.6=h5eee18b_0
+  - aws-checksums=0.1.13=h5eee18b_0
+  - aws-crt-cpp=0.18.16=h6a678d5_0
+  - aws-sdk-cpp=1.10.55=h721c034_0
+  - babel=2.14.0=pyhd8ed1ab_0
+  - backcall=0.2.0=pyh9f0ad1d_0
+  - beautifulsoup4=4.12.3=pyha770c72_0
+  - blas=1.0=mkl
+  - bleach=6.1.0=pyhd8ed1ab_0
+  - boost-cpp=1.84.0=h44aadfe_2
+  - brotli=1.0.9=h9c3ff4c_4
+  - brotli-python=1.0.9=py39h5a03fae_9
+  - bzip2=1.0.8=hd590300_5
+  - c-ares=1.27.0=hd590300_0
+  - ca-certificates=2024.7.4=hbcca054_0
+  - cached-property=1.5.2=hd8ed1ab_1
+  - cached_property=1.5.2=pyha770c72_1
+  - catalogue=2.0.10=py39hf3d152e_0
+  - certifi=2024.7.4=pyhd8ed1ab_0
+  - cffi=1.16.0=py39h7a31438_0
+  - charset-normalizer=3.3.2=pyhd8ed1ab_0
+  - click=8.1.7=unix_pyh707e725_0
+  - cloudpathlib=0.18.1=pyhd8ed1ab_0
+  - colorama=0.4.6=pyhd8ed1ab_0
+  - comm=0.2.2=pyhd8ed1ab_0
+  - confection=0.1.4=py39h8003fee_0
+  - contourpy=1.2.0=py39h7633fee_0
+  - cycler=0.12.1=pyhd8ed1ab_0
+  - cymem=2.0.8=py39h3d6467e_1
+  - cyrus-sasl=2.1.28=h52b45da_1
+  - cython-blis=0.7.10=py39h44dd56e_2
+  - dataclasses=0.8=pyhc8e2a94_3
+  - datasets=2.17.1=pyhd8ed1ab_0
+  - dbus=1.13.18=hb2f20db_0
+  - debugpy=1.8.1=py39h3d6467e_0
+  - decorator=5.1.1=pyhd8ed1ab_0
+  - defusedxml=0.7.1=pyhd8ed1ab_0
+  - diffusers=0.27.1=pyhd8ed1ab_0
+  - dill=0.3.8=pyhd8ed1ab_0
+  - entrypoints=0.4=pyhd8ed1ab_0
+  - exceptiongroup=1.2.0=pyhd8ed1ab_2
+  - executing=2.0.1=pyhd8ed1ab_0
+  - expat=2.6.2=h59595ed_0
+  - fastai=2.7.15=py_0
+  - fastcore=1.5.48=pyhd8ed1ab_0
+  - fastdownload=0.0.7=pyhd8ed1ab_0
+  - fastprogress=1.0.3=pyhd8ed1ab_0
+  - filelock=3.13.3=pyhd8ed1ab_0
+  - fontconfig=2.14.2=h14ed4e7_0
+  - fonttools=4.50.0=py39hd1e30aa_0
+  - fqdn=1.5.1=pyhd8ed1ab_0
+  - freetype=2.12.1=h267a509_2
+  - frozenlist=1.4.1=py39hd1e30aa_0
+  - fsspec=2023.10.0=pyhca7485f_0
+  - gflags=2.2.2=he1b5a44_1004
+  - glib=2.80.0=hf2295e7_1
+  - glib-tools=2.80.0=hde27a5a_1
+  - glog=0.5.0=h48cff8f_0
+  - gmp=6.3.0=h59595ed_1
+  - gmpy2=2.1.2=py39h376b7d2_1
+  - grpc-cpp=1.48.2=he1ff14a_1
+  - grpcio=1.48.2=py39he1ff14a_1
+  - gst-plugins-base=1.14.1=h6a678d5_1
+  - gstreamer=1.14.1=h5eee18b_1
+  - h11=0.14.0=pyhd8ed1ab_0
+  - h2=4.1.0=py39hf3d152e_0
+  - h5py=3.10.0=nompi_py39h2c511df_101
+  - hdf5=1.14.3=nompi_h4f84152_100
+  - hpack=4.0.0=pyh9f0ad1d_0
+  - httpcore=1.0.5=pyhd8ed1ab_0
+  - httpx=0.27.0=pyhd8ed1ab_0
+  - huggingface_hub=0.22.1=pyhd8ed1ab_0
+  - hyperframe=6.0.1=pyhd8ed1ab_0
+  - icu=73.2=h59595ed_0
+  - idna=3.6=pyhd8ed1ab_0
+  - importlib-metadata=7.1.0=pyha770c72_0
+  - importlib-resources=6.4.0=pyhd8ed1ab_0
+  - importlib_metadata=7.1.0=hd8ed1ab_0
+  - importlib_resources=6.4.0=pyhd8ed1ab_0
+  - intel-openmp=2023.1.0=hdb19cb5_46306
+  - ipykernel=6.29.3=pyhd33586a_0
+  - ipython=8.15.0=py39h06a4308_0
+  - isoduration=20.11.0=pyhd8ed1ab_0
+  - jedi=0.19.1=pyhd8ed1ab_0
+  - jinja2=3.1.3=pyhd8ed1ab_0
+  - joblib=1.4.2=pyhd8ed1ab_0
+  - jpeg=9e=h0b41bf4_3
+  - json5=0.9.25=pyhd8ed1ab_0
+  - jsonpointer=3.0.0=py39hf3d152e_0
+  - jsonschema=4.22.0=pyhd8ed1ab_0
+  - jsonschema-specifications=2023.12.1=pyhd8ed1ab_0
+  - jsonschema-with-format-nongpl=4.22.0=pyhd8ed1ab_0
+  - jupyter-lsp=2.2.5=pyhd8ed1ab_0
+  - jupyter_client=8.6.1=pyhd8ed1ab_0
+  - jupyter_core=5.7.2=py39hf3d152e_0
+  - jupyter_events=0.10.0=pyhd8ed1ab_0
+  - jupyter_server=2.14.1=pyhd8ed1ab_0
+  - jupyter_server_terminals=0.5.3=pyhd8ed1ab_0
+  - jupyterlab=4.2.3=pyhd8ed1ab_0
+  - jupyterlab_pygments=0.3.0=pyhd8ed1ab_1
+  - jupyterlab_server=2.27.2=pyhd8ed1ab_0
+  - keyutils=1.6.1=h166bdaf_0
+  - kiwisolver=1.4.5=py39h7633fee_1
+  - krb5=1.20.1=h81ceb04_0
+  - langcodes=3.4.0=pyhd8ed1ab_0
+  - language-data=1.2.0=pyhd8ed1ab_0
+  - lcms2=2.12=h3be6417_0
+  - ld_impl_linux-64=2.40=h41732ed_0
+  - lerc=3.0=h295c915_0
+  - libaec=1.1.3=h59595ed_0
+  - libboost=1.84.0=h8013b2b_2
+  - libboost-devel=1.84.0=h00ab1b0_2
+  - libboost-headers=1.84.0=ha770c72_2
+  - libbrotlicommon=1.0.9=h166bdaf_9
+  - libbrotlidec=1.0.9=h166bdaf_9
+  - libbrotlienc=1.0.9=h166bdaf_9
+  - libclang=14.0.6=default_hc6dbbc7_1
+  - libclang13=14.0.6=default_he11475f_1
+  - libcups=2.3.3=h36d4200_3
+  - libcurl=8.5.0=h251f7ec_0
+  - libdeflate=1.17=h5eee18b_1
+  - libedit=3.1.20230828=h5eee18b_0
+  - libev=4.33=hd590300_2
+  - libevent=2.1.10=h28343ad_4
+  - libexpat=2.6.2=h59595ed_0
+  - libffi=3.4.2=h7f98852_5
+  - libgcc-ng=13.2.0=h807b86a_5
+  - libgfortran-ng=13.2.0=h69a702a_5
+  - libgfortran5=13.2.0=ha4646dd_5
+  - libglib=2.80.0=hf2295e7_1
+  - libgomp=13.2.0=h807b86a_5
+  - libhwloc=2.9.3=default_h554bfaf_1009
+  - libiconv=1.17=hd590300_2
+  - libllvm14=14.0.6=hcd5def8_4
+  - libnghttp2=1.58.0=h47da74e_1
+  - libnsl=2.0.1=hd590300_0
+  - libpng=1.6.43=h2797004_0
+  - libpq=12.17=hdbd6064_0
+  - libprotobuf=3.20.3=h3eb15da_0
+  - libsodium=1.0.18=h36c2ea0_1
+  - libsqlite=3.45.2=h2797004_0
+  - libssh2=1.11.0=h0841786_0
+  - libstdcxx-ng=13.2.0=h7e041cc_5
+  - libthrift=0.15.0=h362ad58_1
+  - libtiff=4.5.1=h6a678d5_0
+  - libuuid=2.38.1=h0b41bf4_0
+  - libwebp-base=1.3.2=hd590300_0
+  - libxcb=1.15=h0b41bf4_0
+  - libxcrypt=4.4.36=hd590300_1
+  - libxkbcommon=1.7.0=h662e7e4_0
+  - libxml2=2.12.6=h232c23b_1
+  - libzlib=1.2.13=hd590300_5
+  - llvm-openmp=18.1.2=h4dfa4b3_0
+  - lz4-c=1.9.4=hcb278e6_0
+  - marisa-trie=1.1.0=py39h3d6467e_1
+  - markdown=3.6=pyhd8ed1ab_0
+  - markdown-it-py=3.0.0=pyhd8ed1ab_0
+  - markupsafe=2.1.5=py39hd1e30aa_0
+  - matplotlib=3.8.3=py39hf3d152e_0
+  - matplotlib-base=3.8.3=py39he9076e7_0
+  - matplotlib-inline=0.1.6=pyhd8ed1ab_0
+  - mdurl=0.1.2=pyhd8ed1ab_0
+  - mistune=3.0.2=pyhd8ed1ab_0
+  - mkl=2023.1.0=h213fc3f_46344
+  - mkl-service=2.4.0=py39h5eee18b_1
+  - mkl_fft=1.3.8=py39h5eee18b_0
+  - mkl_random=1.2.4=py39hdb19cb5_0
+  - mpc=1.3.1=hfe3b2da_0
+  - mpfr=4.2.1=h9458935_0
+  - mpmath=1.3.0=pyhd8ed1ab_0
+  - multidict=6.0.5=py39hd1e30aa_0
+  - multiprocess=0.70.16=py39hd1e30aa_0
+  - munkres=1.1.4=pyh9f0ad1d_0
+  - murmurhash=1.0.10=py39h3d6467e_1
+  - mysql=5.7.24=h721c034_2
+  - nbclient=0.10.0=pyhd8ed1ab_0
+  - nbconvert-core=7.16.4=pyhd8ed1ab_1
+  - nbformat=5.10.4=pyhd8ed1ab_0
+  - ncurses=6.4.20240210=h59595ed_0
+  - nest-asyncio=1.6.0=pyhd8ed1ab_0
+  - networkx=3.2.1=pyhd8ed1ab_0
+  - ninja=1.11.1=h924138e_0
+  - notebook=7.2.1=pyhd8ed1ab_0
+  - notebook-shim=0.2.4=pyhd8ed1ab_0
+  - numpy=1.26.4=py39h5f9d8c6_0
+  - numpy-base=1.26.4=py39hb5e798b_0
+  - openjpeg=2.4.0=h3ad879b_0
+  - openssl=3.3.1=h4bc722e_2
+  - orc=1.7.4=hb3bc3d3_1
+  - overrides=7.7.0=pyhd8ed1ab_0
+  - packaging=24.0=pyhd8ed1ab_0
+  - pandas=1.4.2=py39h1832856_2
+  - pandocfilters=1.5.0=pyhd8ed1ab_0
+  - parso=0.8.3=pyhd8ed1ab_0
+  - pcre2=10.43=hcad00b1_0
+  - pexpect=4.9.0=pyhd8ed1ab_0
+  - pickleshare=0.7.5=py_1003
+  - pillow=10.2.0=py39h5eee18b_0
+  - pip=24.0=pyhd8ed1ab_0
+  - pkgutil-resolve-name=1.3.10=pyhd8ed1ab_1
+  - platformdirs=4.2.0=pyhd8ed1ab_0
+  - ply=3.11=py_1
+  - preshed=3.0.9=py39h3d6467e_1
+  - prometheus_client=0.20.0=pyhd8ed1ab_0
+  - prompt-toolkit=3.0.42=pyha770c72_0
+  - prompt_toolkit=3.0.42=hd8ed1ab_0
+  - protobuf=3.20.3=py39h227be39_1
+  - psutil=5.9.8=py39hd1e30aa_0
+  - pthread-stubs=0.4=h36c2ea0_1001
+  - ptyprocess=0.7.0=pyhd3deb0d_0
+  - pure_eval=0.2.2=pyhd8ed1ab_0
+  - pyarrow=14.0.2=py39h1eedbd7_0
+  - pyarrow-hotfix=0.6=pyhd8ed1ab_0
+  - pycparser=2.21=pyhd8ed1ab_0
+  - pydantic=2.8.2=pyhd8ed1ab_0
+  - pydantic-core=2.20.1=py39h5cde264_0
+  - pygments=2.17.2=pyhd8ed1ab_0
+  - pyparsing=3.1.2=pyhd8ed1ab_0
+  - pyqt=5.15.10=py39h6a678d5_0
+  - pyqt5-sip=12.13.0=py39h5eee18b_0
+  - pysocks=1.7.1=pyha2e5f31_6
+  - python=3.9.19=h0755675_0_cpython
+  - python-dateutil=2.9.0=pyhd8ed1ab_0
+  - python-fastjsonschema=2.20.0=pyhd8ed1ab_0
+  - python-json-logger=2.0.7=pyhd8ed1ab_0
+  - python-tzdata=2024.1=pyhd8ed1ab_0
+  - python-xxhash=3.4.1=py39hd1e30aa_0
+  - python_abi=3.9=4_cp39
+  - pytorch=2.2.0=cpu_py39hdc00b08_0
+  - pytz=2024.1=pyhd8ed1ab_0
+  - pyyaml=6.0.1=py39hd1e30aa_1
+  - pyzmq=25.1.2=py39h8c080ef_0
+  - qt-main=5.15.2=h53bd1ea_10
+  - re2=2022.04.01=h27087fc_0
+  - readline=8.2=h8228510_1
+  - referencing=0.35.1=pyhd8ed1ab_0
+  - regex=2023.12.25=py39hd1e30aa_0
+  - requests=2.31.0=pyhd8ed1ab_0
+  - rfc3339-validator=0.1.4=pyhd8ed1ab_0
+  - rfc3986-validator=0.1.1=pyh9f0ad1d_0
+  - rich=13.7.1=pyhd8ed1ab_0
+  - rpds-py=0.18.1=py39ha68c5e3_0
+  - s2n=1.3.27=hdbd6064_0
+  - safetensors=0.4.2=py39h9fdd4d6_0
+  - scikit-learn=1.5.1=py39hf7b0125_0
+  - scipy=1.11.3=py39h5f9d8c6_0
+  - send2trash=1.8.3=pyh0d859eb_0
+  - setuptools=69.2.0=pyhd8ed1ab_0
+  - shellingham=1.5.4=pyhd8ed1ab_0
+  - sip=6.7.12=py39h3d6467e_0
+  - six=1.16.0=pyh6c4a22f_0
+  - smart-open=7.0.4=hd8ed1ab_0
+  - smart_open=7.0.4=pyhd8ed1ab_0
+  - snappy=1.1.10=h9fff704_0
+  - sniffio=1.3.1=pyhd8ed1ab_0
+  - soupsieve=2.5=pyhd8ed1ab_1
+  - spacy=3.7.5=py39h95fdab5_0
+  - spacy-legacy=3.0.12=pyhd8ed1ab_0
+  - spacy-loggers=1.0.5=pyhd8ed1ab_0
+  - sqlite=3.45.2=h2c6b66d_0
+  - srsly=2.4.8=py39h3d6467e_1
+  - stack_data=0.6.2=pyhd8ed1ab_0
+  - sympy=1.12=pypyh9d50eac_103
+  - tbb=2021.11.0=h00ab1b0_1
+  - tensorboard=2.17.0=pyhd8ed1ab_0
+  - tensorboard-data-server=0.7.0=py39hd4f0224_1
+  - terminado=0.18.1=pyh0d859eb_0
+  - thinc=8.2.3=py39he5d7314_0
+  - threadpoolctl=3.5.0=pyhc1e730c_0
+  - tinycss2=1.3.0=pyhd8ed1ab_0
+  - tk=8.6.13=noxft_h4845f30_101
+  - tomli=2.0.1=pyhd8ed1ab_0
+  - torchvision=0.14.1=cpu_py39hcda3413_0
+  - tornado=6.4=py39hd1e30aa_0
+  - tqdm=4.66.2=pyhd8ed1ab_0
+  - traitlets=5.14.2=pyhd8ed1ab_0
+  - typer=0.12.3=pyhd8ed1ab_0
+  - typer-slim=0.12.3=pyhd8ed1ab_0
+  - typer-slim-standard=0.12.3=hd8ed1ab_0
+  - types-python-dateutil=2.9.0.20240316=pyhd8ed1ab_0
+  - typing-extensions=4.10.0=hd8ed1ab_0
+  - typing_extensions=4.10.0=pyha770c72_0
+  - typing_utils=0.1.0=pyhd8ed1ab_0
+  - tzdata=2024a=h0c530f3_0
+  - unicodedata2=15.1.0=py39hd1e30aa_0
+  - uri-template=1.3.0=pyhd8ed1ab_0
+  - urllib3=2.2.1=pyhd8ed1ab_0
+  - utf8proc=2.6.1=h5eee18b_1
+  - wasabi=1.1.2=py39hf3d152e_1
+  - wcwidth=0.2.13=pyhd8ed1ab_0
+  - weasel=0.4.1=pyhd8ed1ab_1
+  - webcolors=24.6.0=pyhd8ed1ab_0
+  - webencodings=0.5.1=pyhd8ed1ab_2
+  - websocket-client=1.8.0=pyhd8ed1ab_0
+  - werkzeug=3.0.1=pyhd8ed1ab_0
+  - wheel=0.43.0=pyhd8ed1ab_0
+  - wrapt=1.16.0=py39hd1e30aa_0
+  - xkeyboard-config=2.41=hd590300_0
+  - xorg-kbproto=1.0.7=h7f98852_1002
+  - xorg-libx11=1.8.7=h8ee46fc_0
+  - xorg-libxau=1.0.11=hd590300_0
+  - xorg-libxdmcp=1.1.3=h7f98852_0
+  - xorg-xextproto=7.3.0=h0b41bf4_1003
+  - xorg-xproto=7.0.31=h7f98852_1007
+  - xxhash=0.8.2=hd590300_0
+  - xz=5.4.6=h5eee18b_0
+  - yaml=0.2.5=h7f98852_2
+  - yarl=1.9.4=py39hd1e30aa_0
+  - zeromq=4.3.5=h59595ed_1
+  - zipp=3.17.0=pyhd8ed1ab_0
+  - zlib=1.2.13=hd590300_5
+  - zstd=1.5.5=hfc55251_0
+prefix: /storage/home/hcoda1/3/bxia34/.conda/envs/diffusers

frontera_diffusion.sbatch ADDED Viewed

	@@ -0,0 +1,33 @@

+#!/bin/bash
+#SBATCH -J diffusion # Job name
+#SBATCH -p rtx-dev # Partition (queue) to submit to
+#SBATCH -N2            # Number of nodes required
+#SBATCH --ntasks-per-node=1 # One task per node; srun below starts one python process per node
+#SBATCH -t 02:00:00                                    # Wall-clock limit of the job, HH:MM:SS (Ex: 00:15:00 for 15 mins)
+#SBATCH -oReport-%j                         # Combined output and error messages file (%j = job id)
+#SBATCH --mail-type=BEGIN,END,FAIL              # Mail preferences
+python -c "import torch; print('torch.cuda.is_available() =', torch.cuda.is_available()); print('torch.__version__ =', torch.__version__); print('torch.version.cuda =', torch.version.cuda)" # Log the PyTorch/CUDA setup before launching the real job
+pwd # Record the working directory in the job report
+date # Record the start time in the job report
+#module load anaconda3/2022.05 # Load module dependencies
+#module load pytorch
+#conda activate diffusers
+conda env list # Record the available/active conda environments
+module list # Record the loaded environment modules
+cat $0 # Copy this script into the job report so the exact settings are kept with the output
+MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) # First hostname of the job's node list
+MASTER_PORT=$((10000 + RANDOM % 10000)) # Random port in 10000-19999; 12355 was presumably the earlier fixed port
+export MASTER_ADDR=$MASTER_ADDR # Exported for the launched processes; presumably read by diffusion.py for distributed setup - TODO confirm
+export MASTER_PORT=$MASTER_PORT
+srun python diffusion.py \
+    --train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
+    --resume outputs/model-N2000-device_count1-node8-epoch19-19004529 \
+    --num_new_img_per_gpu 50 \
+    --max_num_img_per_gpu 2 \
+    --gradient_accumulation_steps 40 \
+######################################################################################

load_h5.py CHANGED Viewed

@@ -44,6 +44,7 @@ class Dataset4h5(Dataset):
         transform=True,
         ranges_dict=None,
         num_workers=len(os.sched_getaffinity(0))//torch.cuda.device_count(),
         # shuffle=False,
         ):
         super().__init__()
@@ -59,7 +60,7 @@ class Dataset4h5(Dataset):
         self.dim = dim
         self.transform = transform
         self.num_workers = num_workers
         # if ranges_dict == None:
         #     ranges_dict = dict(
         #         images = {
@@ -156,10 +157,10 @@ class Dataset4h5(Dataset):
         with h5py.File(self.dir_name, 'r') as f:
             images_start = time()
             if self.dim == 2:
-                images = f[self.field][idx,0,:self.HII_DIM,-self.num_redshift:][:,None]
                 # images = f[self.field][idx,:self.HII_DIM,:self.HII_DIM,-3][:,None]
             elif self.dim == 3:
-                images = f[self.field][idx,:self.HII_DIM,:self.HII_DIM,-self.num_redshift:][:,None]
             images_end = time()
             # print(f"pid {pid}: images of shape {images.shape} loaded after {load_end-load_start:.3f} s")
             pid = os.getpid()

         transform=True,
         ranges_dict=None,
         num_workers=len(os.sched_getaffinity(0))//torch.cuda.device_count(),
+        startat=0,
         # shuffle=False,
         ):
         super().__init__()
         self.dim = dim
         self.transform = transform
         self.num_workers = num_workers
+        self.startat = startat
         # if ranges_dict == None:
         #     ranges_dict = dict(
         #         images = {
         with h5py.File(self.dir_name, 'r') as f:
             images_start = time()
             if self.dim == 2:
+                images = f[self.field][idx, 0, :self.HII_DIM, self.startat:self.startat+self.num_redshift][:,None]
                 # images = f[self.field][idx,:self.HII_DIM,:self.HII_DIM,-3][:,None]
             elif self.dim == 3:
+                images = f[self.field][idx, :self.HII_DIM, :self.HII_DIM, self.startat:self.startat+self.num_redshift][:,None]
             images_end = time()
             # print(f"pid {pid}: images of shape {images.shape} loaded after {load_end-load_start:.3f} s")
             pid = os.getpid()

phoenix_diffusion.sbatch CHANGED Viewed

@@ -2,10 +2,10 @@
 #SBATCH -J diffusion # Job name
 #SBATCH -A gts-jw254-coda20
 #SBATCH -qembers
-#SBATCH -N8 --gpus-per-node=V100:1 -C V100-16GB              # Number of nodes and cores per node required
 #SBATCH --ntasks-per-node=1
 #SBATCH --mem-per-gpu=16G                        # Memory per GPU
-#SBATCH -t 08:00:00                                    # Duration of the job (Ex: 15 mins)
 #SBATCH -oReport-%j                         # Combined output and error messages file
 #SBATCH --mail-type=BEGIN,END,FAIL              # Mail preferences
@@ -29,8 +29,8 @@ export MASTER_ADDR=$MASTER_ADDR
 export MASTER_PORT=$MASTER_PORT
 srun python diffusion.py \
-    --train 1 \
-    --resume outputs/model-N2000-device_count1-node8-epoch19-18001622 \
     --num_new_img_per_gpu 50 \
     --max_num_img_per_gpu 2 \
     --gradient_accumulation_steps 40 \

 #SBATCH -J diffusion # Job name
 #SBATCH -A gts-jw254-coda20
 #SBATCH -qembers
+#SBATCH -N1 --gpus-per-node=V100:1 -C V100-16GB              # Number of nodes and cores per node required
 #SBATCH --ntasks-per-node=1
 #SBATCH --mem-per-gpu=16G                        # Memory per GPU
+#SBATCH -t 02:00:00                                    # Duration of the job (Ex: 15 mins)
 #SBATCH -oReport-%j                         # Combined output and error messages file
 #SBATCH --mail-type=BEGIN,END,FAIL              # Mail preferences
 export MASTER_PORT=$MASTER_PORT
 srun python diffusion.py \
+    --train 0 \
+    --resume outputs/model-N2000-device_count1-node8-epoch19-19004529 \
     --num_new_img_per_gpu 50 \
     --max_num_img_per_gpu 2 \
     --gradient_accumulation_steps 40 \