0811-170519
Browse files- context_unet.py +2 -2
- diffusion.py +22 -8
- phoenix_diffusion.sbatch +3 -3
- quantify_results.ipynb +0 -0
context_unet.py
CHANGED
|
@@ -533,7 +533,7 @@ class ContextUnet(nn.Module):
|
|
| 533 |
text_outputs = self.token_embedding(y.to(self.dtype))
|
| 534 |
emb = emb + text_outputs.to(emb)
|
| 535 |
|
| 536 |
-
|
| 537 |
h = x.type(self.dtype)
|
| 538 |
#print("0,h.shape =", h.shape)
|
| 539 |
for module in self.input_blocks:
|
|
@@ -551,7 +551,7 @@ class ContextUnet(nn.Module):
|
|
| 551 |
h = module(h, emb)
|
| 552 |
# print("module decoder, h.shape =", h.shape)
|
| 553 |
|
| 554 |
-
|
| 555 |
h = h.type(x.dtype)
|
| 556 |
h = self.out(h)
|
| 557 |
#print("self.out(h)", "h.shape =", h.shape)
|
|
|
|
| 533 |
text_outputs = self.token_embedding(y.to(self.dtype))
|
| 534 |
emb = emb + text_outputs.to(emb)
|
| 535 |
|
| 536 |
+
print("forward, h = x.type(self.dtype), self.dtype =", self.dtype)
|
| 537 |
h = x.type(self.dtype)
|
| 538 |
#print("0,h.shape =", h.shape)
|
| 539 |
for module in self.input_blocks:
|
|
|
|
| 551 |
h = module(h, emb)
|
| 552 |
# print("module decoder, h.shape =", h.shape)
|
| 553 |
|
| 554 |
+
print("h = h.type(x.dtype), x.dtype =", x.dtype)
|
| 555 |
h = h.type(x.dtype)
|
| 556 |
h = self.out(h)
|
| 557 |
#print("self.out(h)", "h.shape =", h.shape)
|
diffusion.py
CHANGED
|
@@ -208,9 +208,9 @@ class DDPMScheduler(nn.Module):
|
|
| 208 |
# x_i = 1/torch.sqrt(self.alpha_t[i])*(x_i-eps*self.beta_t[i]/torch.sqrt(1-self.bar_alpha_t[i])) + torch.sqrt(self.beta_t[i])*z
|
| 209 |
|
| 210 |
# print("x_i.shape =", x_i.shape)
|
| 211 |
-
|
| 212 |
x_i = 1/torch.sqrt(self.alpha_t[i])*(x_i-eps*self.beta_t[i]/torch.sqrt(1-self.bar_alpha_t[i])) + torch.sqrt(self.beta_t[i])*z
|
| 213 |
-
|
| 214 |
|
| 215 |
pbar_sample.update(1)
|
| 216 |
|
|
@@ -268,7 +268,7 @@ class TrainConfig:
|
|
| 268 |
# dim = 2
|
| 269 |
dim = 3#2
|
| 270 |
stride = (2,4) if dim == 2 else (2,2,2)
|
| 271 |
-
num_image =
|
| 272 |
batch_size = 5#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
|
| 273 |
n_epoch = 50#1#50#10#1#50#1#50#5#50#5#50#100#50#100#30#120#5#4# 10#50#20#20#2#5#25 # 120
|
| 274 |
HII_DIM = 64
|
|
@@ -313,8 +313,8 @@ class TrainConfig:
|
|
| 313 |
# params = params
|
| 314 |
# data_dir = './data' # data directory
|
| 315 |
|
| 316 |
-
use_fp16 = True
|
| 317 |
-
dtype = torch.
|
| 318 |
mixed_precision = "fp16"
|
| 319 |
gradient_accumulation_steps = 1
|
| 320 |
|
|
@@ -522,9 +522,9 @@ class DDPM21CM:
|
|
| 522 |
# print("x = x.to(self.config.device), x.dtype =", x.dtype)
|
| 523 |
# x = x.to(self.config.dtype)
|
| 524 |
# print("x = x.to(self.dtype), x.dtype =", x.dtype)
|
| 525 |
-
|
| 526 |
xt, noise, ts = self.ddpm.add_noise(x)
|
| 527 |
-
|
| 528 |
if self.config.guide_w == -1:
|
| 529 |
noise_pred = self.nn_model(xt, ts)
|
| 530 |
else:
|
|
@@ -644,6 +644,20 @@ class DDPM21CM:
|
|
| 644 |
# nn_model = ContextUnet(n_param=1, image_size=28)
|
| 645 |
# nn_model.train()
|
| 646 |
# self.nn_model.to(self.ddpm.device)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 647 |
self.nn_model.eval()
|
| 648 |
|
| 649 |
# self.ema_model = ContextUnet(n_param=config.n_param, image_size=config.HII_DIM, dim=config.dim, stride=config.stride).to(config.device)
|
|
@@ -751,7 +765,7 @@ if __name__ == "__main__":
|
|
| 751 |
max_num_img_per_gpu = args.max_num_img_per_gpu#40#2#20
|
| 752 |
config = TrainConfig()
|
| 753 |
#config.world_size = world_size
|
| 754 |
-
|
| 755 |
config.resume = args.resume
|
| 756 |
# config.resume = f"./outputs/model_state-N30-device_count3-epoch4-172.27.149.181"
|
| 757 |
# config.resume = f"./outputs/model_state-N{config.num_image}-device_count{world_size}-epoch{config.n_epoch-1}"
|
|
|
|
| 208 |
# x_i = 1/torch.sqrt(self.alpha_t[i])*(x_i-eps*self.beta_t[i]/torch.sqrt(1-self.bar_alpha_t[i])) + torch.sqrt(self.beta_t[i])*z
|
| 209 |
|
| 210 |
# print("x_i.shape =", x_i.shape)
|
| 211 |
+
print(f"before, x_i.dtype = {x_i.dtype}, beta_t.dtype = {self.beta_t.dtype}, eps.dtype = {eps.dtype}, alpha_t.dtype = {self.alpha_t.dtype}, z.dtype = {z.dtype}")
|
| 212 |
x_i = 1/torch.sqrt(self.alpha_t[i])*(x_i-eps*self.beta_t[i]/torch.sqrt(1-self.bar_alpha_t[i])) + torch.sqrt(self.beta_t[i])*z
|
| 213 |
+
print(f"after, x_i.dtype = {x_i.dtype}, beta_t.dtype = {self.beta_t.dtype}, eps.dtype = {eps.dtype}, alpha_t.dtype = {self.alpha_t.dtype}, z.dtype = {z.dtype}")
|
| 214 |
|
| 215 |
pbar_sample.update(1)
|
| 216 |
|
|
|
|
| 268 |
# dim = 2
|
| 269 |
dim = 3#2
|
| 270 |
stride = (2,4) if dim == 2 else (2,2,2)
|
| 271 |
+
num_image = 30#00#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
|
| 272 |
batch_size = 5#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
|
| 273 |
n_epoch = 50#1#50#10#1#50#1#50#5#50#5#50#100#50#100#30#120#5#4# 10#50#20#20#2#5#25 # 120
|
| 274 |
HII_DIM = 64
|
|
|
|
| 313 |
# params = params
|
| 314 |
# data_dir = './data' # data directory
|
| 315 |
|
| 316 |
+
#use_fp16 = True
|
| 317 |
+
dtype = torch.float32 #if use_fp16 else torch.float32
|
| 318 |
mixed_precision = "fp16"
|
| 319 |
gradient_accumulation_steps = 1
|
| 320 |
|
|
|
|
| 522 |
# print("x = x.to(self.config.device), x.dtype =", x.dtype)
|
| 523 |
# x = x.to(self.config.dtype)
|
| 524 |
# print("x = x.to(self.dtype), x.dtype =", x.dtype)
|
| 525 |
+
print(f"ddpm.add_noise(x), x.dtype = {x.dtype}")
|
| 526 |
xt, noise, ts = self.ddpm.add_noise(x)
|
| 527 |
+
print(f"ddpm.add_noise(x), xt.dtype = {xt.dtype}")
|
| 528 |
if self.config.guide_w == -1:
|
| 529 |
noise_pred = self.nn_model(xt, ts)
|
| 530 |
else:
|
|
|
|
| 644 |
# nn_model = ContextUnet(n_param=1, image_size=28)
|
| 645 |
# nn_model.train()
|
| 646 |
# self.nn_model.to(self.ddpm.device)
|
| 647 |
+
|
| 648 |
+
self.accelerator = Accelerator(
|
| 649 |
+
mixed_precision=self.config.mixed_precision,
|
| 650 |
+
gradient_accumulation_steps=self.config.gradient_accumulation_steps,
|
| 651 |
+
log_with="tensorboard",
|
| 652 |
+
project_dir=os.path.join(self.config.output_dir, "logs"),
|
| 653 |
+
# distributed_type="MULTI_GPU",
|
| 654 |
+
)
|
| 655 |
+
|
| 656 |
+
self.nn_model, self.optimizer, self.lr_scheduler = \
|
| 657 |
+
self.accelerator.prepare(
|
| 658 |
+
self.nn_model, self.optimizer, self.lr_scheduler
|
| 659 |
+
)
|
| 660 |
+
|
| 661 |
self.nn_model.eval()
|
| 662 |
|
| 663 |
# self.ema_model = ContextUnet(n_param=config.n_param, image_size=config.HII_DIM, dim=config.dim, stride=config.stride).to(config.device)
|
|
|
|
| 765 |
max_num_img_per_gpu = args.max_num_img_per_gpu#40#2#20
|
| 766 |
config = TrainConfig()
|
| 767 |
#config.world_size = world_size
|
| 768 |
+
# config.dtype = torch.float32
|
| 769 |
config.resume = args.resume
|
| 770 |
# config.resume = f"./outputs/model_state-N30-device_count3-epoch4-172.27.149.181"
|
| 771 |
# config.resume = f"./outputs/model_state-N{config.num_image}-device_count{world_size}-epoch{config.n_epoch-1}"
|
phoenix_diffusion.sbatch
CHANGED
|
@@ -5,7 +5,7 @@
|
|
| 5 |
#SBATCH -N1 --gpus-per-node=V100:1 -C V100-32GB # Number of nodes and GPUs per node required
|
| 6 |
#SBATCH --ntasks-per-node=1
|
| 7 |
#SBATCH --mem-per-gpu=16G # Memory per GPU
|
| 8 |
-
#SBATCH -t
|
| 9 |
#SBATCH -oReport-%j # Combined output and error messages file
|
| 10 |
#SBATCH --error=error-%j
|
| 11 |
#SBATCH --mail-type=BEGIN,END,FAIL # Mail preferences
|
|
@@ -30,9 +30,9 @@ export MASTER_PORT=$MASTER_PORT
|
|
| 30 |
|
| 31 |
srun python diffusion.py \
|
| 32 |
--train 1 \
|
| 33 |
-
--resume outputs/model_state-
|
| 34 |
--num_new_img_per_gpu 50 \
|
| 35 |
-
--max_num_img_per_gpu
|
| 36 |
|
| 37 |
######################################################################################
|
| 38 |
|
|
|
|
| 5 |
#SBATCH -N1 --gpus-per-node=V100:1 -C V100-32GB # Number of nodes and GPUs per node required
|
| 6 |
#SBATCH --ntasks-per-node=1
|
| 7 |
#SBATCH --mem-per-gpu=16G # Memory per GPU
|
| 8 |
+
#SBATCH -t 00:30:00 # Duration of the job (HH:MM:SS; here 30 mins)
|
| 9 |
#SBATCH -oReport-%j # Combined output and error messages file
|
| 10 |
#SBATCH --error=error-%j
|
| 11 |
#SBATCH --mail-type=BEGIN,END,FAIL # Mail preferences
|
|
|
|
| 30 |
|
| 31 |
srun python diffusion.py \
|
| 32 |
--train 1 \
|
| 33 |
+
--resume outputs/model_state-N3000-device_count1-node2-epoch49-172.27.149.67 \
|
| 34 |
--num_new_img_per_gpu 50 \
|
| 35 |
+
--max_num_img_per_gpu 5 \
|
| 36 |
|
| 37 |
######################################################################################
|
| 38 |
|
quantify_results.ipynb
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|