0815-184556

Browse files

Files changed (6) hide show

context_unet.py +4 -4
diffusion.py +13 -10
frontera_generate_dataset.sbatch +4 -4
generate_dataset.py +2 -2
phoenix_diffusion.sbatch +5 -4
quantify_results.ipynb +62 -11

context_unet.py CHANGED Viewed

@@ -533,7 +533,7 @@ class ContextUnet(nn.Module):
             text_outputs = self.token_embedding(y.to(self.dtype))
             emb = emb + text_outputs.to(emb)
-        print("forward, h = x.type(self.dtype), self.dtype =", self.dtype)
         h = x.type(self.dtype)
         #print("0,h.shape =", h.shape)
         for module in self.input_blocks:
@@ -543,7 +543,7 @@ class ContextUnet(nn.Module):
         # print("2,h.shape =", h.shape)
         h = self.middle_block(h, emb)
         #print("middle block, h.shape =", h.shape)
-        print("2, h.dtype =", h.dtype)
         for module in self.output_blocks:
             #print("for module in self.output_blocks, h.shape =", h.shape)
             # print("len(hs) =", len(hs), ", hs[-1].shape =", hs[-1].shape)
@@ -551,9 +551,9 @@ class ContextUnet(nn.Module):
             h = module(h, emb)
             # print("module decoder, h.shape =", h.shape)
-        print("h = h.type(x.dtype), x.dtype =", x.dtype, h.dtype)
         h = h.type(x.dtype)
         h = self.out(h)
-        print("self.out(h)", "h.dtype =", h.dtype)
         return h

             text_outputs = self.token_embedding(y.to(self.dtype))
             emb = emb + text_outputs.to(emb)
+        #print("forward, h = x.type(self.dtype), self.dtype =", self.dtype)
         h = x.type(self.dtype)
         #print("0,h.shape =", h.shape)
         for module in self.input_blocks:
         # print("2,h.shape =", h.shape)
         h = self.middle_block(h, emb)
         #print("middle block, h.shape =", h.shape)
+        #print("2, h.dtype =", h.dtype)
         for module in self.output_blocks:
             #print("for module in self.output_blocks, h.shape =", h.shape)
             # print("len(hs) =", len(hs), ", hs[-1].shape =", hs[-1].shape)
             h = module(h, emb)
             # print("module decoder, h.shape =", h.shape)
+        #print("h = h.type(x.dtype), x.dtype =", x.dtype, h.dtype)
         h = h.type(x.dtype)
         h = self.out(h)
+        #print("self.out(h)", "h.dtype =", h.dtype)
         return h

diffusion.py CHANGED Viewed

@@ -33,8 +33,9 @@ import warnings
 #warnings.filterwarnings("ignore", message=r"^Detected kernel version")
 from dataclasses import dataclass
-import h5py
 import torch
 import torch.nn as nn
 from torch.utils.data import DataLoader, Dataset
 # from datasets import Dataset
@@ -208,9 +209,9 @@ class DDPMScheduler(nn.Module):
                 # x_i = 1/torch.sqrt(self.alpha_t[i])*(x_i-eps*self.beta_t[i]/torch.sqrt(1-self.bar_alpha_t[i])) + torch.sqrt(self.beta_t[i])*z
             # print("x_i.shape =", x_i.shape)
-            print(f"before, x_i.dtype = {x_i.dtype}, beta_t.dtype = {self.beta_t.dtype}, eps.dtype = {eps.dtype}, alpha_t.dtype = {self.alpha_t.dtype}, z.dtype = {z.dtype}")
             x_i = 1/torch.sqrt(self.alpha_t[i])*(x_i-eps*self.beta_t[i]/torch.sqrt(1-self.bar_alpha_t[i])) + torch.sqrt(self.beta_t[i])*z
-            print(f"after, x_i.dtype = {x_i.dtype}, beta_t.dtype = {self.beta_t.dtype}, eps.dtype = {eps.dtype}, alpha_t.dtype = {self.alpha_t.dtype}, z.dtype = {z.dtype}")
             pbar_sample.update(1)
@@ -268,8 +269,8 @@ class TrainConfig:
     # dim = 2
     dim = 3#2
     stride = (2,4) if dim == 2 else (2,2,2)
-    num_image = 30#00#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
-    batch_size = 5#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
     n_epoch = 50#1#50#10#1#50#1#50#5#50#5#50#100#50#100#30#120#5#4# 10#50#20#20#2#5#25 # 120
     HII_DIM = 64
     num_redshift = 64#256#512#256#512#256#512#256#512#64#512#64#512#64#256CUDAoom#128#64#512#128#64#512#256#256#64#512#128
@@ -522,19 +523,19 @@ class DDPM21CM:
                     # print("x = x.to(self.config.device), x.dtype =", x.dtype)
                     x = x.to(self.config.dtype)
                     # print("x = x.to(self.dtype), x.dtype =", x.dtype)
-                    print(f"ddpm.add_noise(x), x.dtype = {x.dtype}")
                     xt, noise, ts = self.ddpm.add_noise(x)
-                    print(f"ddpm.add_noise(x), xt.dtype = {xt.dtype}")
                     if self.config.guide_w == -1:
                         noise_pred = self.nn_model(xt, ts).to(x.dtype)
                     else:
                         c = c.to(self.config.device)
                         noise_pred = self.nn_model(xt, ts, c).to(x.dtype)
-                    print("noise_pred = self.nn_model(xt, ts, c), noise_pred.dtype =", noise_pred.dtype, noise.dtype)
                     loss = F.mse_loss(noise, noise_pred)
-                    print(f"loss.dtype =", loss.dtype)
                     self.accelerator.backward(loss)
                     self.accelerator.clip_grad_norm_(self.nn_model.parameters(), 1)
                     self.optimizer.step()
@@ -742,6 +743,7 @@ if __name__ == "__main__":
     parser.add_argument("--resume", type=str, required=False, help="filename of the model to resume", default=False)
     parser.add_argument("--num_new_img_per_gpu", type=int, required=False, default=4)
     parser.add_argument("--max_num_img_per_gpu", type=int, required=False, default=2)
     args = parser.parse_args()
@@ -766,8 +768,9 @@ if __name__ == "__main__":
         max_num_img_per_gpu = args.max_num_img_per_gpu#40#2#20
         config = TrainConfig()
         #config.world_size = world_size
-        # config.dtype = torch.float32
         config.resume = args.resume
         # config.resume = f"./outputs/model_state-N30-device_count3-epoch4-172.27.149.181"
         # config.resume = f"./outputs/model_state-N{config.num_image}-device_count{world_size}-epoch{config.n_epoch-1}"
         # config.resume = f"./outputs/model_state-N{config.num_image}-device_count1-epoch{config.n_epoch-1}"

 #warnings.filterwarnings("ignore", message=r"^Detected kernel version")
 from dataclasses import dataclass
+#import h5py
 import torch
+#print(f"starting, torch.__path__ = {torch.__path__}, torch.cuda.device_count() = {torch.cuda.device_count()}, torch.cuda.is_available() = {torch.cuda.is_available()}")
 import torch.nn as nn
 from torch.utils.data import DataLoader, Dataset
 # from datasets import Dataset
                 # x_i = 1/torch.sqrt(self.alpha_t[i])*(x_i-eps*self.beta_t[i]/torch.sqrt(1-self.bar_alpha_t[i])) + torch.sqrt(self.beta_t[i])*z
             # print("x_i.shape =", x_i.shape)
+            #print(f"before, x_i.dtype = {x_i.dtype}, beta_t.dtype = {self.beta_t.dtype}, eps.dtype = {eps.dtype}, alpha_t.dtype = {self.alpha_t.dtype}, z.dtype = {z.dtype}")
             x_i = 1/torch.sqrt(self.alpha_t[i])*(x_i-eps*self.beta_t[i]/torch.sqrt(1-self.bar_alpha_t[i])) + torch.sqrt(self.beta_t[i])*z
+            #print(f"after, x_i.dtype = {x_i.dtype}, beta_t.dtype = {self.beta_t.dtype}, eps.dtype = {eps.dtype}, alpha_t.dtype = {self.alpha_t.dtype}, z.dtype = {z.dtype}")
             pbar_sample.update(1)
     # dim = 2
     dim = 3#2
     stride = (2,4) if dim == 2 else (2,2,2)
+    num_image = 300#0#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
+    batch_size = 1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
     n_epoch = 50#1#50#10#1#50#1#50#5#50#5#50#100#50#100#30#120#5#4# 10#50#20#20#2#5#25 # 120
     HII_DIM = 64
     num_redshift = 64#256#512#256#512#256#512#256#512#64#512#64#512#64#256CUDAoom#128#64#512#128#64#512#256#256#64#512#128
                     # print("x = x.to(self.config.device), x.dtype =", x.dtype)
                     x = x.to(self.config.dtype)
                     # print("x = x.to(self.dtype), x.dtype =", x.dtype)
+                    # print(f"ddpm.add_noise(x), x.dtype = {x.dtype}")
                     xt, noise, ts = self.ddpm.add_noise(x)
+                    # print(f"ddpm.add_noise(x), xt.dtype = {xt.dtype}")
                     if self.config.guide_w == -1:
                         noise_pred = self.nn_model(xt, ts).to(x.dtype)
                     else:
                         c = c.to(self.config.device)
                         noise_pred = self.nn_model(xt, ts, c).to(x.dtype)
+                    # print("noise_pred = self.nn_model(xt, ts, c), noise_pred.dtype =", noise_pred.dtype, noise.dtype)
                     loss = F.mse_loss(noise, noise_pred)
+                    #print(f"loss.dtype =", loss.dtype)
                     self.accelerator.backward(loss)
                     self.accelerator.clip_grad_norm_(self.nn_model.parameters(), 1)
                     self.optimizer.step()
     parser.add_argument("--resume", type=str, required=False, help="filename of the model to resume", default=False)
     parser.add_argument("--num_new_img_per_gpu", type=int, required=False, default=4)
     parser.add_argument("--max_num_img_per_gpu", type=int, required=False, default=2)
+    parser.add_argument("--gradient_accumulation_steps", type=int, required=False, default=1)
     args = parser.parse_args()
         max_num_img_per_gpu = args.max_num_img_per_gpu#40#2#20
         config = TrainConfig()
         #config.world_size = world_size
+        config.dtype = torch.float32
         config.resume = args.resume
+        config.gradient_accumulation_steps = args.gradient_accumulation_steps
         # config.resume = f"./outputs/model_state-N30-device_count3-epoch4-172.27.149.181"
         # config.resume = f"./outputs/model_state-N{config.num_image}-device_count{world_size}-epoch{config.n_epoch-1}"
         # config.resume = f"./outputs/model_state-N{config.num_image}-device_count1-epoch{config.n_epoch-1}"

frontera_generate_dataset.sbatch CHANGED Viewed

@@ -25,9 +25,9 @@
 #SBATCH -J datasets           # Job name
 #SBATCH -o Report-%j       # Name of stdout output file
-#SBATCH -p normal # Queue (partition) name
-#SBATCH -N 12 # 50              # Total # of nodes
-#SBATCH -t 2-00:00:00        # Run time (hh:mm:ss)
 #SBATCH --mail-type=all    # Send email at begin and end of job
 #SBATCH --mail-user=xiabin@gatech.edu
 #SBATCH --ntasks-per-node=1
@@ -44,7 +44,7 @@ conda env list
 srun python generate_dataset.py \
     --save_direc $SCRATCH \
-    --num_images 25600 \
     --BOX_LEN 128 \
     --HII_DIM 64 \
     --NON_CUBIC_FACTOR 16 \

 #SBATCH -J datasets           # Job name
 #SBATCH -o Report-%j       # Name of stdout output file
+#SBATCH -p small # Queue (partition) name
+#SBATCH -N 2 # 50              # Total # of nodes
+#SBATCH -t 09:00:00        # Run time (hh:mm:ss)
 #SBATCH --mail-type=all    # Send email at begin and end of job
 #SBATCH --mail-user=xiabin@gatech.edu
 #SBATCH --ntasks-per-node=1
 srun python generate_dataset.py \
     --save_direc $SCRATCH \
+    --num_images 800\
     --BOX_LEN 128 \
     --HII_DIM 64 \
     --NON_CUBIC_FACTOR 16 \

generate_dataset.py CHANGED Viewed

@@ -443,8 +443,8 @@ if __name__ == '__main__':
     args = parser.parse_args()
     params_ranges = dict(
-        ION_Tvir_MIN = [4,6],#4.8,#5.477,#4.699,#5.6,#4.4, #[4,6],
-        HII_EFF_FACTOR = [10,250],#131.341,#200,#30,#19.037,#131.341, #[10, 250],
         )
     kwargs = dict(

     args = parser.parse_args()
     params_ranges = dict(
+        ION_Tvir_MIN = 4.8,#5.477,#4.699,#5.6,#4.4, #[4,6],
+        HII_EFF_FACTOR = 131.341,#200,#30,#19.037,#131.341, #[10, 250],
         )
     kwargs = dict(

phoenix_diffusion.sbatch CHANGED Viewed

@@ -2,10 +2,10 @@
 #SBATCH -J diffusion # Job name
 #SBATCH -A gts-jw254-coda20
 #SBATCH -qembers
-#SBATCH -N1 --gpus-per-node=V100:1 -C V100-32GB              # Number of nodes and cores per node required
 #SBATCH --ntasks-per-node=1
 #SBATCH --mem-per-gpu=16G                        # Memory per GPU
-#SBATCH -t 00:30:00                                    # Duration of the job (Ex: 15 mins)
 #SBATCH -oReport-%j                         # Combined output and error messages file
 #SBATCH --error=error-%j
 #SBATCH --mail-type=BEGIN,END,FAIL              # Mail preferences
@@ -17,9 +17,10 @@ pwd
 date
 module load anaconda3/2022.05 # Load module dependencies
 module load pytorch
-module list
 conda activate diffusers
 conda env list
 cat $0
 MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
@@ -33,6 +34,6 @@ srun python diffusion.py \
     --resume outputs/model_state-N3000-device_count1-node2-epoch49-172.27.149.67 \
     --num_new_img_per_gpu 50 \
     --max_num_img_per_gpu 5 \
 ######################################################################################

 #SBATCH -J diffusion # Job name
 #SBATCH -A gts-jw254-coda20
 #SBATCH -qembers
+#SBATCH -N1 --gpus-per-node=V100:1 -C V100-16GB              # Number of nodes and GPUs per node required
 #SBATCH --ntasks-per-node=1
 #SBATCH --mem-per-gpu=16G                        # Memory per GPU
+#SBATCH -t 01:00:00                                    # Duration of the job (Ex: 15 mins)
 #SBATCH -oReport-%j                         # Combined output and error messages file
 #SBATCH --error=error-%j
 #SBATCH --mail-type=BEGIN,END,FAIL              # Mail preferences
 date
 module load anaconda3/2022.05 # Load module dependencies
 module load pytorch
 conda activate diffusers
 conda env list
+module list
 cat $0
 MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
     --resume outputs/model_state-N3000-device_count1-node2-epoch49-172.27.149.67 \
     --num_new_img_per_gpu 50 \
     --max_num_img_per_gpu 5 \
+    --gradient_accumulation_steps 60 \
 ######################################################################################

quantify_results.ipynb CHANGED Viewed

@@ -203,31 +203,82 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
-   "outputs": [],
-   "source": []
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
-   "outputs": [],
-   "source": []
   },
   {
    "cell_type": "code",
@@ -2183,7 +2234,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.19"
   }
  },
  "nbformat": 4,

   },
   {
    "cell_type": "code",
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
+   "source": [
+    "!module load pytorch\n",
+    "# !module list"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
+   "source": [
+    "import torch"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": 3,
    "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['/storage/home/hcoda1/3/bxia34/.conda/envs/rh9_diffusers/lib/python3.12/site-packages/torch']"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "torch.__path__"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": 4,
    "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "False"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "torch.cuda.is_available()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "torch.cuda.device_count()"
+   ]
   },
   {
    "cell_type": "code",
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
+   "version": "3.12.5"
   }
  },
  "nbformat": 4,