05200138
Browse files
diffusion.py
CHANGED
|
@@ -55,8 +55,8 @@ from tqdm.auto import tqdm
|
|
| 55 |
import datetime
|
| 56 |
from pathlib import Path
|
| 57 |
#from diffusers.optimization import get_cosine_schedule_with_warmup
|
| 58 |
-
from accelerate import notebook_launcher, Accelerator
|
| 59 |
-
import accelerate
|
| 60 |
#print("accelerate:", accelerate.__version__, accelerate.__path__)#, accelerate.__file__)
|
| 61 |
from huggingface_hub import create_repo, upload_folder
|
| 62 |
|
|
@@ -484,13 +484,13 @@ class DDPM21CM:
|
|
| 484 |
# plot_unet = True
|
| 485 |
|
| 486 |
self.load()
|
| 487 |
-
self.accelerator = Accelerator(
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
# distributed_type="MULTI_GPU",
|
| 493 |
-
)
|
| 494 |
# print("!!!!!!!!!!!!!!!!!!!self.accelerator.device:", self.accelerator.device)
|
| 495 |
# if self.accelerator.is_main_process:
|
| 496 |
if self.config.global_rank == 0: # or torch.cuda.current_device() == 0:
|
|
@@ -500,8 +500,8 @@ class DDPM21CM:
|
|
| 500 |
self.repo_id = create_repo(
|
| 501 |
repo_id=self.config.hub_model_id or Path(self.config.output_dir).name, exist_ok=True
|
| 502 |
).repo_id
|
| 503 |
-
self.accelerator.init_trackers(f"{self.config.run_name}")
|
| 504 |
-
|
| 505 |
|
| 506 |
# print("!!!!!!!!!!!!!!!!, before prepare, self.dataloader.sampler =", self.dataloader.sampler)
|
| 507 |
#model_start = time()
|
|
@@ -519,7 +519,7 @@ class DDPM21CM:
|
|
| 519 |
print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} torch.distributed.is_initialized False!!!!!!!!!!!!!!!")
|
| 520 |
|
| 521 |
print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank}; nn_model.device = {self.nn_model.device}")
|
| 522 |
-
acc_prep_start = time()
|
| 523 |
#self.nn_model, self.optimizer, self.dataloader, self.lr_scheduler = \
|
| 524 |
# self.accelerator.prepare(
|
| 525 |
# self.nn_model, self.optimizer, self.dataloader, self.lr_scheduler
|
|
@@ -528,8 +528,8 @@ class DDPM21CM:
|
|
| 528 |
#self.optimizer = self.accelerator.prepare(self.optimizer)
|
| 529 |
#self.dataloader = self.accelerator.prepare(self.dataloader)
|
| 530 |
#self.lr_scheduler = self.accelerator.prepare(self.lr_scheduler)
|
| 531 |
-
acc_prep_end = time()
|
| 532 |
-
print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} accelerate.prepare cost {acc_prep_end-acc_prep_start:.3f}s")
|
| 533 |
# self.nn_model, self.optimizer, self.lr_scheduler = \
|
| 534 |
# self.accelerator.prepare(
|
| 535 |
# self.nn_model, self.optimizer, self.lr_scheduler
|
|
@@ -553,26 +553,30 @@ class DDPM21CM:
|
|
| 553 |
# print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} ddpm.train costs {train_end-train_start:.3f}s")
|
| 554 |
|
| 555 |
# print(f"cuda:{torch.cuda.current_device()}, x[:,0,:2,0,0] =", x[:,0,:2,0,0])
|
| 556 |
-
with self.accelerator.accumulate(self.nn_model):
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
| 569 |
|
| 570 |
# print("noise_pred = self.nn_model(xt, ts, c), noise_pred.dtype =", noise_pred.dtype, noise.dtype)
|
| 571 |
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 576 |
self.optimizer.step()
|
| 577 |
self.lr_scheduler.step()
|
| 578 |
self.optimizer.zero_grad()
|
|
@@ -591,9 +595,20 @@ class DDPM21CM:
|
|
| 591 |
)
|
| 592 |
pbar_train.set_postfix(**logs)
|
| 593 |
|
| 594 |
-
self.accelerator.log(logs, step=global_step)
|
|
|
|
|
|
|
|
|
|
| 595 |
global_step += 1
|
| 596 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 597 |
# if ep == config.n_epoch-1 or (ep+1)*config.save_period==1:
|
| 598 |
self.save(ep)
|
| 599 |
# # 检查参数和梯度的一致性
|
|
|
|
| 55 |
import datetime
|
| 56 |
from pathlib import Path
|
| 57 |
#from diffusers.optimization import get_cosine_schedule_with_warmup
|
| 58 |
+
#from accelerate import notebook_launcher, Accelerator
|
| 59 |
+
#import accelerate
|
| 60 |
#print("accelerate:", accelerate.__version__, accelerate.__path__)#, accelerate.__file__)
|
| 61 |
from huggingface_hub import create_repo, upload_folder
|
| 62 |
|
|
|
|
| 484 |
# plot_unet = True
|
| 485 |
|
| 486 |
self.load()
|
| 487 |
+
#self.accelerator = Accelerator(
|
| 488 |
+
# mixed_precision=self.config.mixed_precision,
|
| 489 |
+
# gradient_accumulation_steps=self.config.gradient_accumulation_steps,
|
| 490 |
+
# log_with="tensorboard",
|
| 491 |
+
# project_dir=os.path.join(self.config.output_dir, "logs"),
|
| 492 |
# distributed_type="MULTI_GPU",
|
| 493 |
+
#)
|
| 494 |
# print("!!!!!!!!!!!!!!!!!!!self.accelerator.device:", self.accelerator.device)
|
| 495 |
# if self.accelerator.is_main_process:
|
| 496 |
if self.config.global_rank == 0: # or torch.cuda.current_device() == 0:
|
|
|
|
| 500 |
self.repo_id = create_repo(
|
| 501 |
repo_id=self.config.hub_model_id or Path(self.config.output_dir).name, exist_ok=True
|
| 502 |
).repo_id
|
| 503 |
+
#self.accelerator.init_trackers(f"{self.config.run_name}")
|
| 504 |
+
self.config.logger = SummaryWriter(f"logs/{self.config.run_name}")
|
| 505 |
|
| 506 |
# print("!!!!!!!!!!!!!!!!, before prepare, self.dataloader.sampler =", self.dataloader.sampler)
|
| 507 |
#model_start = time()
|
|
|
|
| 519 |
print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} torch.distributed.is_initialized False!!!!!!!!!!!!!!!")
|
| 520 |
|
| 521 |
print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank}; nn_model.device = {self.nn_model.device}")
|
| 522 |
+
#acc_prep_start = time()
|
| 523 |
#self.nn_model, self.optimizer, self.dataloader, self.lr_scheduler = \
|
| 524 |
# self.accelerator.prepare(
|
| 525 |
# self.nn_model, self.optimizer, self.dataloader, self.lr_scheduler
|
|
|
|
| 528 |
#self.optimizer = self.accelerator.prepare(self.optimizer)
|
| 529 |
#self.dataloader = self.accelerator.prepare(self.dataloader)
|
| 530 |
#self.lr_scheduler = self.accelerator.prepare(self.lr_scheduler)
|
| 531 |
+
#acc_prep_end = time()
|
| 532 |
+
#print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} accelerate.prepare cost {acc_prep_end-acc_prep_start:.3f}s")
|
| 533 |
# self.nn_model, self.optimizer, self.lr_scheduler = \
|
| 534 |
# self.accelerator.prepare(
|
| 535 |
# self.nn_model, self.optimizer, self.lr_scheduler
|
|
|
|
| 553 |
# print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} ddpm.train costs {train_end-train_start:.3f}s")
|
| 554 |
|
| 555 |
# print(f"cuda:{torch.cuda.current_device()}, x[:,0,:2,0,0] =", x[:,0,:2,0,0])
|
| 556 |
+
#with self.accelerator.accumulate(self.nn_model):
|
| 557 |
+
x = x.to(self.config.device)
|
| 558 |
+
# print("x = x.to(self.config.device), x.dtype =", x.dtype)
|
| 559 |
+
x = x.to(self.config.dtype)
|
| 560 |
+
# print("x = x.to(self.dtype), x.dtype =", x.dtype)
|
| 561 |
+
# print(f"ddpm.add_noise(x), x.dtype = {x.dtype}")
|
| 562 |
+
xt, noise, ts = self.ddpm.add_noise(x)
|
| 563 |
+
# print(f"ddpm.add_noise(x), xt.dtype = {xt.dtype}")
|
| 564 |
+
if self.config.guide_w == -1:
|
| 565 |
+
noise_pred = self.nn_model(xt, ts).to(x.dtype)
|
| 566 |
+
else:
|
| 567 |
+
c = c.to(self.config.device)
|
| 568 |
+
noise_pred = self.nn_model(xt, ts, c).to(x.dtype)
|
| 569 |
|
| 570 |
# print("noise_pred = self.nn_model(xt, ts, c), noise_pred.dtype =", noise_pred.dtype, noise.dtype)
|
| 571 |
|
| 572 |
+
loss = F.mse_loss(noise, noise_pred)
|
| 573 |
+
loss = loss / self.config.gradient_accumulation_steps
|
| 574 |
+
loss.backward()
|
| 575 |
+
#print(f"loss.dtype =", loss.dtype)
|
| 576 |
+
#self.accelerator.backward(loss)
|
| 577 |
+
#self.accelerator.clip_grad_norm_(self.nn_model.parameters(), 1)
|
| 578 |
+
if (i+i) % self.config.gradient_accumulation_steps == 0:
|
| 579 |
+
torch.nn.utils.clip_grad_norm_(self.nn_model.parameters(), max_norm=1.0)
|
| 580 |
self.optimizer.step()
|
| 581 |
self.lr_scheduler.step()
|
| 582 |
self.optimizer.zero_grad()
|
|
|
|
| 595 |
)
|
| 596 |
pbar_train.set_postfix(**logs)
|
| 597 |
|
| 598 |
+
#self.accelerator.log(logs, step=global_step)
|
| 599 |
+
if self.config.global_rank == 0:
|
| 600 |
+
self.config.logger.add_scalar("MSE", logs["loss"], global_step = global_step)
|
| 601 |
+
self.config.logger.add_scalar("learning_rate", logs["lr"], global_step = global_step)
|
| 602 |
global_step += 1
|
| 603 |
|
| 604 |
+
if (i+i) % self.config.gradient_accumulation_steps != 0:
|
| 605 |
+
print(f"(i+1)%self.config.gradient_accumulation_steps = {(i+1)%self.config.gradient_accumulation_steps}".center(120,'-'))
|
| 606 |
+
torch.nn.utils.clip_grad_norm_(self.nn_model.parameters(), max_norm=1.0)
|
| 607 |
+
self.optimizer.step()
|
| 608 |
+
self.lr_scheduler.step()
|
| 609 |
+
self.optimizer.zero_grad()
|
| 610 |
+
|
| 611 |
+
|
| 612 |
# if ep == config.n_epoch-1 or (ep+1)*config.save_period==1:
|
| 613 |
self.save(ep)
|
| 614 |
# # 检查参数和梯度的一致性
|
logs/05200138/events.out.tfevents.1725591706.nid001616.1370814.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:65bd787f236d3f6a1868f10046680d2f5bb91ca170e868a01b42a14cea4f1509
|
| 3 |
+
size 90068
|