recoilme committed
Commit e458bd6 · 1 Parent(s): f84b3af

dispersive_loss_disabled

samples/unet_320x576_0.jpg CHANGED

Git LFS Details (old)

  • SHA256: e3e1eed66df8356d9a4f3abb4fca96b2750c05a48c64689b76a003e1c396f029
  • Pointer size: 131 Bytes
  • Size of remote file: 101 kB

Git LFS Details (new)

  • SHA256: 9753bcbd8f3bdd79b1a981ba703c56413c3cdcb652957e22b7795223abf97ada
  • Pointer size: 130 Bytes
  • Size of remote file: 73.5 kB

samples/unet_384x576_0.jpg CHANGED

Git LFS Details (old)

  • SHA256: 807ca39bcd04ad0dbdcedde52bb7690f1bdb73c97e37bdb1262736461f791291
  • Pointer size: 130 Bytes
  • Size of remote file: 87.1 kB

Git LFS Details (new)

  • SHA256: 56f54bd854da6bcb33baa894ff063cbdcb8a4f9371f214aa7979020f7a93a479
  • Pointer size: 130 Bytes
  • Size of remote file: 89.2 kB

samples/unet_448x576_0.jpg CHANGED

Git LFS Details (old)

  • SHA256: a9579bb3c1d3b0e4bd76ad0834aae34a7d51ccecb675b6af42cceddece2d4f9a
  • Pointer size: 131 Bytes
  • Size of remote file: 109 kB

Git LFS Details (new)

  • SHA256: 29c80e76b4191508c8b8af7f3be240a8fb0d94bd498e5966a65b78e4e0ffcd7e
  • Pointer size: 131 Bytes
  • Size of remote file: 114 kB

samples/unet_512x576_0.jpg CHANGED

Git LFS Details (old)

  • SHA256: 5dfbc40067f048ba11326fe65a21b3730cd95d0b2f274ee45d529046aa44880e
  • Pointer size: 131 Bytes
  • Size of remote file: 145 kB

Git LFS Details (new)

  • SHA256: 370f5855f49baaa69979cc0fa830c698eb1afcf3e9ebcc73275c82518f65f5b2
  • Pointer size: 131 Bytes
  • Size of remote file: 113 kB

samples/unet_576x320_0.jpg CHANGED

Git LFS Details (old)

  • SHA256: ce9de204808a5d7f42de8f3a445fc130847ce50e3e0fa05d26fa68153d2aee5b
  • Pointer size: 130 Bytes
  • Size of remote file: 66.1 kB

Git LFS Details (new)

  • SHA256: bab97a9bb2d6d4fb9ced68b2afa38fbe88af438da6b9e80dcc1f4407368ff058
  • Pointer size: 130 Bytes
  • Size of remote file: 84.4 kB

samples/unet_576x384_0.jpg CHANGED

Git LFS Details (old)

  • SHA256: 637c8af557d8c7035cfe9209163dac867f722fcd6781ecdc7e1a46600a21c79a
  • Pointer size: 130 Bytes
  • Size of remote file: 46 kB

Git LFS Details (new)

  • SHA256: fa3e8315e008779b32fae73687f443c74dee2582b59005fc033b3239d32e7a8b
  • Pointer size: 130 Bytes
  • Size of remote file: 82.9 kB

samples/unet_576x448_0.jpg CHANGED

Git LFS Details (old)

  • SHA256: 7438a12c17f5b7b52160202df63d5484c96fb3ade27f57c1c3f460da532d3452
  • Pointer size: 131 Bytes
  • Size of remote file: 104 kB

Git LFS Details (new)

  • SHA256: 1570a01ed4a0b461b98e62afcbe0a9bb6a1d127264006ba67580a27305e67b79
  • Pointer size: 130 Bytes
  • Size of remote file: 81.2 kB

samples/unet_576x512_0.jpg CHANGED

Git LFS Details (old)

  • SHA256: 3ad86a4b554edf5c9a6677d78f55766874cdce0ccff24205a79b779a55674aef
  • Pointer size: 131 Bytes
  • Size of remote file: 108 kB

Git LFS Details (new)

  • SHA256: e4afe8993c88d19b20b9fa4f4856150cfe0f210553d3ef7b9810051f0b70e370
  • Pointer size: 131 Bytes
  • Size of remote file: 145 kB

samples/unet_576x576_0.jpg CHANGED

Git LFS Details (old)

  • SHA256: 74334562d66b14cef1d6e879294999356ae3e6b2666c4f0fe1b8d066a5fb36cb
  • Pointer size: 131 Bytes
  • Size of remote file: 146 kB

Git LFS Details (new)

  • SHA256: 70d1d7db450e142f7c60c05f615c95ef052a55a99f628480a81bf8ed07567305
  • Pointer size: 131 Bytes
  • Size of remote file: 176 kB
train.py CHANGED
@@ -27,7 +27,7 @@ import torch.nn.functional as F
 # --------------------------- Parameters ---------------------------
 ds_path = "datasets/576"
 project = "unet"
-batch_size = 25
+batch_size = 30
 base_learning_rate = 9.5e-6
 min_learning_rate = 8.5e-6
 num_epochs = 20
@@ -43,20 +43,22 @@ unet_gradient = True
 clip_sample = False #Scheduler
 fixed_seed = False
 shuffle = True
-dispersive_loss = True
+dispersive_loss_enabled = False
 torch.backends.cuda.matmul.allow_tf32 = True
 torch.backends.cudnn.allow_tf32 = True
 torch.backends.cuda.enable_mem_efficient_sdp(False)
 dtype = torch.float32
 save_barrier = 1.03
 dispersive_temperature=0.5
-dispersive_weight=0.05
+dispersive_weight= 0.05
 percentile_clipping = 95 # 8bit optim
-clip_grad_norm = 0.5
+betta2 = 0.99 #0.97
+eps = 1e-6 #1e-5
+clip_grad_norm = 1. #.5
 steps_offset = 1 # Scheduler
 limit = 0
 checkpoints_folder = ""
-mixed_precision = "fp16"
+mixed_precision = "bf16" #"fp16"
 gradient_accumulation_steps = 2
 accelerator = Accelerator(
     mixed_precision=mixed_precision,
@@ -140,36 +142,6 @@ class AccelerateDispersiveLoss:
         # IMPORTANT: it is negative and should decrease
         return dispersive_loss
 
-    def compute_dispersive_loss2(self):
-        # If there are no activations, return 0
-        if not self.activations:
-            return torch.tensor(0.0, device=self.accelerator.device, requires_grad=True)
-
-        # Work only with the main process's local activations
-        activations = self.activations[-1].float()
-
-        batch_size = activations.shape[0]
-        if batch_size < 2:
-            return torch.tensor(0.0, device=self.accelerator.device, requires_grad=True)
-
-        # Normalization
-        norm = torch.norm(activations, dim=1, keepdim=True).clamp(min=1e-12)
-        sf = activations / norm
-
-        # Compute pairwise distances
-        distance = torch.nn.functional.pdist(sf, p=2)
-        distance = distance.clamp(min=1e-12)
-        distance_squared = distance ** 2
-
-        # Compute the loss with clipping for stability
-        exp_neg_dist = torch.exp((-distance_squared / self.temperature).clamp(min=-20, max=20))
-        exp_neg_dist = exp_neg_dist + 1e-12
-
-        mean_exp = torch.mean(exp_neg_dist)
-        dispersive_loss = torch.log(mean_exp.clamp(min=1e-12))
-
-        return dispersive_loss
-
     def clear_activations(self):
         self.activations.clear()
 
@@ -177,140 +149,7 @@ class AccelerateDispersiveLoss:
         for hook in self.hooks:
             hook.remove()
         self.hooks.clear()
-
-class AccelerateDispersiveLoss2:
-    def __init__(self, accelerator, temperature=0.5, weight=0.5):
-        self.accelerator = accelerator
-        self.temperature = temperature
-        self.weight = weight
-        self.activations = []
-        self.hooks = []
-
-    def register_hooks(self, model, target_layer="down_blocks.0"):
-        # Get the "clean" model without the DDP wrapper
-        unwrapped_model = self.accelerator.unwrap_model(model)
-
-        print("=== Searching for layers in the unwrapped model ===")
-        for name, module in unwrapped_model.named_modules():
-            if target_layer in name:
-                hook = module.register_forward_hook(self.hook_fn)
-                self.hooks.append(hook)
-                print(f"✅ Hook registered on: {name}")
-                break
-
-    def hook_fn(self, module, input, output):
-        if isinstance(output, tuple):
-            activation = output[0]
-        else:
-            activation = output
-
-        if len(activation.shape) > 2:
-            activation = activation.view(activation.shape[0], -1)
-
-        self.activations.append(activation.detach())
-
-    def compute_dispersive_loss_fix(self):
-        if not self.activations:
-            return torch.tensor(0.0, requires_grad=True)
-
-        local_activations = self.activations[-1]
-
-        # Gather activations from all GPUs
-        if self.accelerator.num_processes > 1:
-            gathered_activations = self.accelerator.gather(local_activations)
-        else:
-            gathered_activations = local_activations
-
-        batch_size = gathered_activations.shape[0]
-        if batch_size < 2:
-            return torch.tensor(0.0, requires_grad=True)
-
-        # Cast to float32 for stability
-        gathered_activations = gathered_activations.float()
-
-        # Normalize with an eps for stability
-        norm = torch.norm(gathered_activations, dim=1, keepdim=True).clamp(min=1e-12)
-        sf = gathered_activations / norm
-
-        # Compute pairwise distances
-        distance = torch.nn.functional.pdist(sf, p=2)
-        distance = distance.clamp(min=1e-12)  # avoid values that are too small
-        distance_squared = distance ** 2
-
-        # Exponential with clipping
-        exp_neg_dist = torch.exp((-distance_squared / self.temperature).clamp(min=-20, max=20))
-        exp_neg_dist = exp_neg_dist + 1e-12  # avoid zeros
-
-        # Mean and log
-        mean_exp = torch.mean(exp_neg_dist)
-        dispersive_loss = torch.log(mean_exp.clamp(min=1e-12))
-
-        return dispersive_loss
-
-    def compute_dispersive_loss(self):
-        if not self.activations:
-            return torch.tensor(0.0, requires_grad=True)
-
-        local_activations = self.activations[-1].float()
-
-        # Gather activations from all GPUs
-        if self.accelerator.num_processes > 1:
-            gathered_activations = self.accelerator.gather(local_activations)
-        else:
-            gathered_activations = local_activations
-
-        batch_size = gathered_activations.shape[0]
-        if batch_size < 2:
-            return torch.tensor(0.0, requires_grad=True)
-
-        # Normalize and compute the loss
-        sf = gathered_activations / torch.norm(gathered_activations, dim=1, keepdim=True)
-        sf = sf.float()
-        distance = torch.nn.functional.pdist(sf, p=2) ** 2
-        exp_neg_dist = torch.exp(-distance / self.temperature) + 1e-5
-        dispersive_loss = torch.log(torch.mean(exp_neg_dist))
-
-        # IMPORTANT: it is negative and should decrease
-        return dispersive_loss
-
-
-    def compute_dispersive_loss_single(self):
-        if not self.activations:
-            return torch.tensor(0.0, requires_grad=True)
-
-        local_activations = self.activations[-1]  # activations from the current GPU
-
-        # Gather activations from all GPUs
-        if self.accelerator.num_processes > 1:
-            # Use accelerate for the gather
-            gathered_activations = self.accelerator.gather(local_activations)
-        else:
-            gathered_activations = local_activations
-
-        # Compute the loss on the main process
-        if self.accelerator.is_main_process:
-            batch_size = gathered_activations.shape[0]
-            if batch_size < 2:
-                return torch.tensor(0.0, requires_grad=True)
-
-            # Normalize and compute the loss
-            sf = gathered_activations / torch.norm(gathered_activations, dim=1, keepdim=True)
-            distance = torch.nn.functional.pdist(sf, p=2) ** 2
-            exp_neg_dist = torch.exp(-distance / self.temperature) + 1e-5
-            dispersive_loss = torch.log(torch.mean(exp_neg_dist))
-
-            return dispersive_loss
-        else:
-            # Return 0 on non-main processes
-            return torch.tensor(0.0, requires_grad=True)
-
-    def clear_activations(self):
-        self.activations.clear()
-
-    def remove_hooks(self):
-        for hook in self.hooks:
-            hook.remove()
-        self.hooks.clear()
 
 
 # --------------------------- WandB initialization ---------------------------
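Aside: the retained AccelerateDispersiveLoss class drives everything above through a single forward hook that captures flattened activations. A minimal, self-contained sketch of that pattern follows; the two-layer toy model is an illustrative stand-in for the repo's UNet, not part of this commit.

import torch
import torch.nn as nn

activations = []

def hook_fn(module, inputs, output):
    # Some blocks return tuples; keep the primary tensor and flatten per sample
    out = output[0] if isinstance(output, tuple) else output
    activations.append(out.flatten(start_dim=1).detach())

model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.Conv2d(8, 8, 3))
handle = model[1].register_forward_hook(hook_fn)   # analogous to hooking "down_blocks.2"
_ = model(torch.randn(2, 3, 32, 32))
print(activations[0].shape)  # torch.Size([2, 6272]): one flattened row per sample
handle.remove()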
@@ -551,7 +390,7 @@ if os.path.isdir(latest_checkpoint):
 print(f"torch.nn.functional.get_flash_attention_available(): {torch.nn.functional.get_flash_attention_available()}")
 
 # Register the hook on the model
-if dispersive_loss:
+if dispersive_loss_enabled:
     dispersive_hook = AccelerateDispersiveLoss(
         accelerator=accelerator,
         temperature=dispersive_temperature,
@@ -632,7 +471,7 @@ else:
 def create_optimizer(name, params):
     if name == "adam8bit":
         return bnb.optim.AdamW8bit(
-            params, lr=base_learning_rate, betas=(0.9, 0.97), eps=1e-5, weight_decay=0.001,
+            params, lr=base_learning_rate, betas=(0.9, betta2), eps=eps, weight_decay=0.001,
             percentile_clipping=percentile_clipping
         )
     elif name == "adam":
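A hedged usage note: the call site for this factory sits outside the diff, so the exact invocation below is an assumption.

# Assumed call site (not shown in this diff): the revised betta2/eps config
# values flow into AdamW8bit through the factory above.
optimizer = create_optimizer("adam8bit", unet.parameters())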
@@ -686,7 +525,7 @@ else:
 unet, optimizer, lr_scheduler = accelerator.prepare(unet, optimizer, lr_scheduler)
 
 # Register hooks AFTER prepare
-if dispersive_loss:
+if dispersive_loss_enabled:
     dispersive_hook.register_hooks(unet, "down_blocks.2")
 
 # --------------------------- Fixed samples for generation ---------------------------
@@ -867,7 +706,7 @@ for epoch in range(start_epoch, start_epoch + num_epochs):
 noisy_latents = scheduler.add_noise(latents, noise, timesteps)
 
 # Clear activations before the forward pass
-if dispersive_loss:
+if dispersive_loss_enabled:
     dispersive_hook.clear_activations()
 
 # Use the target value
@@ -880,7 +719,7 @@ for epoch in range(start_epoch, start_epoch + num_epochs):
 # Dispersive Loss
 # Identical vectors: Loss = -0.0000
 # Orthogonal vectors: Loss = -3.9995
-if dispersive_loss:
+if dispersive_loss_enabled:
     with torch.amp.autocast('cuda', enabled=False):
         dispersive_loss = dispersive_hook.weight * dispersive_hook.compute_dispersive_loss()
         if torch.isnan(dispersive_loss) or torch.isinf(dispersive_loss):
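The two reference values in that comment follow directly from the loss formula log(mean(exp(-d^2 / temperature))) at temperature 0.5: identical vectors give d^2 = 0, orthogonal unit vectors give d^2 = 2, so exp(-4) ≈ 0.0183 and log of it ≈ -4. A minimal standalone sketch reproducing them (the toy tensors are illustrative):

# Standalone check of the values quoted above, using the same formula as
# compute_dispersive_loss with dispersive_temperature = 0.5.
import torch

def dispersive_loss_ref(x, temperature=0.5):
    sf = x / torch.norm(x, dim=1, keepdim=True)          # L2-normalize rows
    distance = torch.nn.functional.pdist(sf, p=2) ** 2   # squared pairwise distances
    return torch.log(torch.mean(torch.exp(-distance / temperature) + 1e-5))

identical = torch.ones(4, 8)    # equal rows   -> d^2 = 0 -> log(1 + 1e-5) ~ -0.0000
orthogonal = torch.eye(4, 8)    # one-hot rows -> d^2 = 2 -> ~ log(exp(-4)) ~ -3.9995
print(dispersive_loss_ref(identical), dispersive_loss_ref(orthogonal))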
@@ -888,13 +727,21 @@ for epoch in range(start_epoch, start_epoch + num_epochs):
 
 # Total loss
 # dispersive_loss should decrease, and so should the total - hence the plus
-total_loss = loss + dispersive_loss
+if dispersive_loss_enabled:
+    total_loss = loss + dispersive_loss
+else:
+    total_loss = loss
 
 # Check for nan/inf before backward
 if torch.isnan(loss) or torch.isinf(loss):
     print(f"Rank {accelerator.process_index}: Found nan/inf in loss: {loss}")
     save_model = False
     break
+
+if torch.isnan(total_loss) or torch.isinf(total_loss):
+    print(f"Rank {accelerator.process_index}: Found nan/inf in total_loss: {total_loss}")
+    print(f"Problem batch: step={step}, latents.shape={latents.shape}, embeddings.shape={embeddings.shape}")
+    continue
 
 if (global_step % 100 == 0) or (global_step % sample_interval == 0):
     accelerator.wait_for_everyone()
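What presumably follows this hunk is the standard Accelerate backward step; it is outside the diff, so the sketch below is an assumption based on the config block (clip_grad_norm raised to 1.0) and the "grad" field logged in the next hunk.

# Assumed continuation (not part of this diff): backpropagate the combined loss
# and clip gradients under Accelerate's gradient-accumulation bookkeeping.
accelerator.backward(total_loss)
if accelerator.sync_gradients:
    # clip_grad_norm_ returns the pre-clip gradient norm, which the
    # wandb call below would log as "grad"
    grad = accelerator.clip_grad_norm_(unet.parameters(), clip_grad_norm)
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()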
@@ -938,8 +785,8 @@ for epoch in range(start_epoch, start_epoch + num_epochs):
     "epoch": epoch,
     "grad": grad,
     "global_step": global_step,
-    "dispersive_loss": dispersive_loss,
-    "total_loss": total_loss
+    **({"dispersive_loss": dispersive_loss} if dispersive_loss_enabled else {}),
+    **({"total_loss": total_loss} if dispersive_loss_enabled else {})
 })
 
 # Generate samples at the given interval
 
unet/diffusion_pytorch_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fd195f44f05b8269dfc948cbc9046d1585b6c126485aba158c549a58dee09ae3
+oid sha256:1e14b52a3c3ef2ae70a86b58094ade0d9dd40bee48b05a08a1fde027ed735e6f
 size 7014306128