Xsmos committed on
Commit
3fd64a1
·
verified ·
1 Parent(s): 9234e65
diffusion.py CHANGED
@@ -55,8 +55,8 @@ from tqdm.auto import tqdm
55
  import datetime
56
  from pathlib import Path
57
  #from diffusers.optimization import get_cosine_schedule_with_warmup
58
- from accelerate import notebook_launcher, Accelerator
59
- import accelerate
60
  #print("accelerate:", accelerate.__version__, accelerate.__path__)#, accelerate.__file__)
61
  from huggingface_hub import create_repo, upload_folder
62
 
@@ -484,13 +484,13 @@ class DDPM21CM:
484
  # plot_unet = True
485
 
486
  self.load()
487
- self.accelerator = Accelerator(
488
- mixed_precision=self.config.mixed_precision,
489
- gradient_accumulation_steps=self.config.gradient_accumulation_steps,
490
- log_with="tensorboard",
491
- project_dir=os.path.join(self.config.output_dir, "logs"),
492
  # distributed_type="MULTI_GPU",
493
- )
494
  # print("!!!!!!!!!!!!!!!!!!!self.accelerator.device:", self.accelerator.device)
495
  # if self.accelerator.is_main_process:
496
  if self.config.global_rank == 0: # or torch.cuda.current_device() == 0:
@@ -500,8 +500,8 @@ class DDPM21CM:
500
  self.repo_id = create_repo(
501
  repo_id=self.config.hub_model_id or Path(self.config.output_dir).name, exist_ok=True
502
  ).repo_id
503
- self.accelerator.init_trackers(f"{self.config.run_name}")
504
-
505
 
506
  # print("!!!!!!!!!!!!!!!!, before prepare, self.dataloader.sampler =", self.dataloader.sampler)
507
  #model_start = time()
@@ -519,7 +519,7 @@ class DDPM21CM:
519
  print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} torch.distributed.is_initialized False!!!!!!!!!!!!!!!")
520
 
521
  print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank}; nn_model.device = {self.nn_model.device}")
522
- acc_prep_start = time()
523
  #self.nn_model, self.optimizer, self.dataloader, self.lr_scheduler = \
524
  # self.accelerator.prepare(
525
  # self.nn_model, self.optimizer, self.dataloader, self.lr_scheduler
@@ -528,8 +528,8 @@ class DDPM21CM:
528
  #self.optimizer = self.accelerator.prepare(self.optimizer)
529
  #self.dataloader = self.accelerator.prepare(self.dataloader)
530
  #self.lr_scheduler = self.accelerator.prepare(self.lr_scheduler)
531
- acc_prep_end = time()
532
- print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} accelerate.prepare cost {acc_prep_end-acc_prep_start:.3f}s")
533
  # self.nn_model, self.optimizer, self.lr_scheduler = \
534
  # self.accelerator.prepare(
535
  # self.nn_model, self.optimizer, self.lr_scheduler
@@ -553,26 +553,30 @@ class DDPM21CM:
553
  # print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} ddpm.train costs {train_end-train_start:.3f}s")
554
 
555
  # print(f"cuda:{torch.cuda.current_device()}, x[:,0,:2,0,0] =", x[:,0,:2,0,0])
556
- with self.accelerator.accumulate(self.nn_model):
557
- x = x.to(self.config.device)
558
- # print("x = x.to(self.config.device), x.dtype =", x.dtype)
559
- x = x.to(self.config.dtype)
560
- # print("x = x.to(self.dtype), x.dtype =", x.dtype)
561
- # print(f"ddpm.add_noise(x), x.dtype = {x.dtype}")
562
- xt, noise, ts = self.ddpm.add_noise(x)
563
- # print(f"ddpm.add_noise(x), xt.dtype = {xt.dtype}")
564
- if self.config.guide_w == -1:
565
- noise_pred = self.nn_model(xt, ts).to(x.dtype)
566
- else:
567
- c = c.to(self.config.device)
568
- noise_pred = self.nn_model(xt, ts, c).to(x.dtype)
569
 
570
  # print("noise_pred = self.nn_model(xt, ts, c), noise_pred.dtype =", noise_pred.dtype, noise.dtype)
571
 
572
- loss = F.mse_loss(noise, noise_pred)
573
- #print(f"loss.dtype =", loss.dtype)
574
- self.accelerator.backward(loss)
575
- #self.accelerator.clip_grad_norm_(self.nn_model.parameters(), 1)
 
 
 
 
576
  self.optimizer.step()
577
  self.lr_scheduler.step()
578
  self.optimizer.zero_grad()
@@ -591,9 +595,20 @@ class DDPM21CM:
591
  )
592
  pbar_train.set_postfix(**logs)
593
 
594
- self.accelerator.log(logs, step=global_step)
 
 
 
595
  global_step += 1
596
 
 
 
 
 
 
 
 
 
597
  # if ep == config.n_epoch-1 or (ep+1)*config.save_period==1:
598
  self.save(ep)
599
  # # 检查参数和梯度的一致性
 
55
  import datetime
56
  from pathlib import Path
57
  #from diffusers.optimization import get_cosine_schedule_with_warmup
58
+ #from accelerate import notebook_launcher, Accelerator
59
+ #import accelerate
60
  #print("accelerate:", accelerate.__version__, accelerate.__path__)#, accelerate.__file__)
61
  from huggingface_hub import create_repo, upload_folder
62
 
 
484
  # plot_unet = True
485
 
486
  self.load()
487
+ #self.accelerator = Accelerator(
488
+ # mixed_precision=self.config.mixed_precision,
489
+ # gradient_accumulation_steps=self.config.gradient_accumulation_steps,
490
+ # log_with="tensorboard",
491
+ # project_dir=os.path.join(self.config.output_dir, "logs"),
492
  # distributed_type="MULTI_GPU",
493
+ #)
494
  # print("!!!!!!!!!!!!!!!!!!!self.accelerator.device:", self.accelerator.device)
495
  # if self.accelerator.is_main_process:
496
  if self.config.global_rank == 0: # or torch.cuda.current_device() == 0:
 
500
  self.repo_id = create_repo(
501
  repo_id=self.config.hub_model_id or Path(self.config.output_dir).name, exist_ok=True
502
  ).repo_id
503
+ #self.accelerator.init_trackers(f"{self.config.run_name}")
504
+ self.config.logger = SummaryWriter(f"logs/{self.config.run_name}")
505
 
506
  # print("!!!!!!!!!!!!!!!!, before prepare, self.dataloader.sampler =", self.dataloader.sampler)
507
  #model_start = time()
 
519
  print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} torch.distributed.is_initialized False!!!!!!!!!!!!!!!")
520
 
521
  print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank}; nn_model.device = {self.nn_model.device}")
522
+ #acc_prep_start = time()
523
  #self.nn_model, self.optimizer, self.dataloader, self.lr_scheduler = \
524
  # self.accelerator.prepare(
525
  # self.nn_model, self.optimizer, self.dataloader, self.lr_scheduler
 
528
  #self.optimizer = self.accelerator.prepare(self.optimizer)
529
  #self.dataloader = self.accelerator.prepare(self.dataloader)
530
  #self.lr_scheduler = self.accelerator.prepare(self.lr_scheduler)
531
+ #acc_prep_end = time()
532
+ #print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} accelerate.prepare cost {acc_prep_end-acc_prep_start:.3f}s")
533
  # self.nn_model, self.optimizer, self.lr_scheduler = \
534
  # self.accelerator.prepare(
535
  # self.nn_model, self.optimizer, self.lr_scheduler
 
553
  # print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} ddpm.train costs {train_end-train_start:.3f}s")
554
 
555
  # print(f"cuda:{torch.cuda.current_device()}, x[:,0,:2,0,0] =", x[:,0,:2,0,0])
556
+ #with self.accelerator.accumulate(self.nn_model):
557
+ x = x.to(self.config.device)
558
+ # print("x = x.to(self.config.device), x.dtype =", x.dtype)
559
+ x = x.to(self.config.dtype)
560
+ # print("x = x.to(self.dtype), x.dtype =", x.dtype)
561
+ # print(f"ddpm.add_noise(x), x.dtype = {x.dtype}")
562
+ xt, noise, ts = self.ddpm.add_noise(x)
563
+ # print(f"ddpm.add_noise(x), xt.dtype = {xt.dtype}")
564
+ if self.config.guide_w == -1:
565
+ noise_pred = self.nn_model(xt, ts).to(x.dtype)
566
+ else:
567
+ c = c.to(self.config.device)
568
+ noise_pred = self.nn_model(xt, ts, c).to(x.dtype)
569
 
570
  # print("noise_pred = self.nn_model(xt, ts, c), noise_pred.dtype =", noise_pred.dtype, noise.dtype)
571
 
572
+ loss = F.mse_loss(noise, noise_pred)
573
+ loss = loss / self.config.gradient_accumulation_steps
574
+ loss.backward()
575
+ #print(f"loss.dtype =", loss.dtype)
576
+ #self.accelerator.backward(loss)
577
+ #self.accelerator.clip_grad_norm_(self.nn_model.parameters(), 1)
578
+ if (i+1) % self.config.gradient_accumulation_steps == 0:
579
+ torch.nn.utils.clip_grad_norm_(self.nn_model.parameters(), max_norm=1.0)
580
  self.optimizer.step()
581
  self.lr_scheduler.step()
582
  self.optimizer.zero_grad()
 
595
  )
596
  pbar_train.set_postfix(**logs)
597
 
598
+ #self.accelerator.log(logs, step=global_step)
599
+ if self.config.global_rank == 0:
600
+ self.config.logger.add_scalar("MSE", logs["loss"], global_step = global_step)
601
+ self.config.logger.add_scalar("learning_rate", logs["lr"], global_step = global_step)
602
  global_step += 1
603
 
604
+ if (i+1) % self.config.gradient_accumulation_steps != 0:
605
+ print(f"(i+1)%self.config.gradient_accumulation_steps = {(i+1)%self.config.gradient_accumulation_steps}".center(120,'-'))
606
+ torch.nn.utils.clip_grad_norm_(self.nn_model.parameters(), max_norm=1.0)
607
+ self.optimizer.step()
608
+ self.lr_scheduler.step()
609
+ self.optimizer.zero_grad()
610
+
611
+
612
  # if ep == config.n_epoch-1 or (ep+1)*config.save_period==1:
613
  self.save(ep)
614
  # # 检查参数和梯度的一致性
logs/05200138/events.out.tfevents.1725591706.nid001616.1370814.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65bd787f236d3f6a1868f10046680d2f5bb91ca170e868a01b42a14cea4f1509
3
+ size 90068