recoilme committed on Sep 11, 2025

Commit

5bbef16

1 Parent(s): b9a43be

init

Browse files

Files changed (17) hide show

.gitignore +1 -1
butterfly.zip → datasets/butterfly/data-00000-of-00001.arrow +2 -2
datasets/butterfly/dataset_info.json +47 -0
datasets/butterfly/state.json +20 -0
requirements.txt +0 -3
samples/unet_192x384_0.jpg +3 -0
samples/unet_256x384_0.jpg +3 -0
samples/unet_320x384_0.jpg +3 -0
samples/unet_384x192_0.jpg +3 -0
samples/unet_384x256_0.jpg +3 -0
samples/unet_384x320_0.jpg +3 -0
src/dataset_from_folder.py +37 -75
src/dataset_sample.ipynb +3 -9
src/model_create.ipynb +299 -180
train.py +7 -7
unet/config.json +78 -0
unet/diffusion_pytorch_model.safetensors +3 -0

.gitignore CHANGED Viewed

@@ -7,7 +7,7 @@ __pycache__/
 src/samples
 # cache
 cache
-datasets
 test
 wandb
 nohup.out

 src/samples
 # cache
 cache
+# datasets
 test
 wandb
 nohup.out

butterfly.zip → datasets/butterfly/data-00000-of-00001.arrow RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5b923bef9a5d1fe7103e960c943c110ec46155fc71d7f45e0070f3ef072bbdcb
-size 237918081

 version https://git-lfs.github.com/spec/v1
+oid sha256:d8479e8b4cf0c3505189c608cedf8b35ab073f14c6b7db0a9e66b75925e1c519
+size 53255512

datasets/butterfly/dataset_info.json ADDED Viewed

	@@ -0,0 +1,47 @@

+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "image_path": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "text": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "vae": {
+      "feature": {
+        "feature": {
+          "feature": {
+            "dtype": "float16",
+            "_type": "Value"
+          },
+          "_type": "List"
+        },
+        "_type": "List"
+      },
+      "_type": "List"
+    },
+    "embeddings": {
+      "feature": {
+        "feature": {
+          "dtype": "float32",
+          "_type": "Value"
+        },
+        "_type": "List"
+      },
+      "_type": "List"
+    },
+    "width": {
+      "dtype": "int64",
+      "_type": "Value"
+    },
+    "height": {
+      "dtype": "int64",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}

datasets/butterfly/state.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "23217366db2250df",
+  "_format_columns": [
+    "image_path",
+    "text",
+    "vae",
+    "embeddings",
+    "width",
+    "height"
+  ],
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}

requirements.txt CHANGED Viewed

@@ -1,6 +1,3 @@
-# torch>=2.6.0
-# torchvision>=0.21.0
-# torchaudio>=2.6.0
 diffusers>=0.32.2
 accelerate>=1.5.2
 datasets>=3.5.0

 diffusers>=0.32.2
 accelerate>=1.5.2
 datasets>=3.5.0

samples/unet_192x384_0.jpg ADDED Viewed

Git LFS Details

SHA256: f8d593e4370ad0523c154177853b0a0494792995654d3d7d1fc8baf43e519d35
Pointer size: 130 Bytes
Size of remote file: 26 kB

samples/unet_256x384_0.jpg ADDED Viewed

Git LFS Details

SHA256: 3d972079e6277012a49509f8d529cbb5540acdaa1e32267c3879db2ecce1c7a3
Pointer size: 130 Bytes
Size of remote file: 46.7 kB

samples/unet_320x384_0.jpg ADDED Viewed

Git LFS Details

SHA256: 7a9ea056f95fdc5041d89e342a6ba83f37de3280771b4d3907a51d558d75bf83
Pointer size: 130 Bytes
Size of remote file: 59.7 kB

samples/unet_384x192_0.jpg ADDED Viewed

Git LFS Details

SHA256: 55b102073dcf0bc45d7b7ca96d0b151d638f7875f1da86c5f2ea44fddf1e2e72
Pointer size: 130 Bytes
Size of remote file: 26.4 kB

samples/unet_384x256_0.jpg ADDED Viewed

Git LFS Details

SHA256: 3f9ab248800ebf6c8c52b88ed8ee3bd27ec6f104ceae35d04944f988f6b99c33
Pointer size: 130 Bytes
Size of remote file: 25.1 kB

samples/unet_384x320_0.jpg ADDED Viewed

Git LFS Details

SHA256: acc0960f81837a0e161f9faa57916819f0325015720173cc1a13b997e0aa0631
Pointer size: 130 Bytes
Size of remote file: 55.3 kB

src/dataset_from_folder.py CHANGED Viewed

@@ -24,10 +24,8 @@ batch_size = 5
 min_size = 192 #256 #192
 max_size = 384 #256 #384
 step = 64
-img_share = 1.0
 empty_share = 0.05
 limit = 0
-textemb_full = False
 # Основная процедура обработки
 folder_path = "/workspace/butterfly" #alchemist"
 save_path = "/workspace/sdxs3d/datasets/butterfly" #"alchemist"
@@ -44,18 +42,13 @@ def clear_cuda_memory():
 # ---------------- 2️⃣ Загрузка моделей ----------------
 def load_models():
     print("Загрузка моделей...")
-    #vae = AutoencoderKLWan.from_pretrained("AiArtLab/simplevae",subfolder="wan16x_vae_nightly",torch_dtype=dtype).to(device).eval()
-    vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", subfolder=None,torch_dtype=dtype).to(device).eval()
-    #vae = AutoencoderKL.from_pretrained("AiArtLab/simplevae",subfolder="simple_vae_nightly",torch_dtype=dtype).to(device).eval()
-    #vae = AutoencoderKL.from_pretrained("black-forest-labs/FLUX.1-schnell",subfolder="vae",torch_dtype=dtype).to(device).eval()
-    #vae = AutoencoderKL.from_pretrained("/home/recoilme/sdxs/vae", variant="fp16",torch_dtype=dtype).to(device).eval()
-    model = AutoModel.from_pretrained("visheratin/mexma-siglip2", dtype=dtype, trust_remote_code=True, optimized=True).to(device).eval()
-    processor = AutoImageProcessor.from_pretrained("visheratin/mexma-siglip2", use_fast=True)
-    tokenizer = AutoTokenizer.from_pretrained("visheratin/mexma-siglip2")
-    return vae, model, processor, tokenizer
-vae, model, processor, tokenizer = load_models()
 shift_factor = getattr(vae.config, "shift_factor", 0.0)
 if shift_factor is None:
@@ -124,57 +117,39 @@ def get_image_transform(min_size=256, max_size=512, step=64):
     return transform
 # ---------------- 4️⃣ Функции обработки ----------------
-def encode_images_batch(images, processor, model, empty_share=0.0):
-    """
-    images: список PIL.Image
-    processor: трансформер для препроцессинга изображений
-    model: vision encoder (например, CLIP или подобный)
-    empty_share: доля эмбеддингов, которые нужно обнулить
-    """
-    # Преобразуем весь батч сразу (вместо обхода по каждому изображению)
-    processed = processor(images=images, return_tensors="pt")
-    pixel_values = processed["pixel_values"].to(device, dtype)
     with torch.inference_mode():
-        outputs = model.vision_model(pixel_values)
-        #hidden_states = outputs.last_hidden_state  # [B, seq_len, dim]
-        pooled = outputs.pooler_output            # [B, dim]
-        # Добавляем pooled embedding в конец sequence
-        #context = torch.cat([hidden_states, pooled.unsqueeze(1)], dim=1)  # [B, seq_len+1, dim]
-        context = pooled.unsqueeze(1)
-        # Добавляем нулевые эмбеддинги с вероятностью empty_share
-        if empty_share > 0:
-            batch_size = context.shape[0]
-            num_empty = int(batch_size * empty_share)
-            if num_empty > 0:
-                zero_embeddings = torch.zeros_like(context[:num_empty])
-                context[:num_empty] = zero_embeddings
-    # Преобразуем bfloat16 в float32 если нужно
-    if context.dtype == torch.bfloat16:
-        context = context.to(torch.float32)
-    return context.cpu().numpy()  # [B, seq_len+1, dim]
-def encode_texts_batch(texts, tokenizer, model):
-    with torch.inference_mode():
-        text_tokenized = tokenizer(texts, return_tensors="pt", padding="max_length",
-            max_length=512,
-            truncation=True).to(device)
-        text_embeddings = model.encode_texts(text_tokenized.input_ids, text_tokenized.attention_mask)
-    return text_embeddings.unsqueeze(1).cpu().numpy()
-def encode_texts_batch_full(texts, tokenizer, model):
-    with torch.inference_mode():
-        text_tokenized = tokenizer(texts, return_tensors="pt", padding="max_length",max_length=512,truncation=True).to(device)
-        features = model.text_model(
-            input_ids=text_tokenized.input_ids, attention_mask=text_tokenized.attention_mask
-        ).last_hidden_state
-        features_proj = model.text_projector(features)
-    return features_proj.cpu().numpy()
 def clean_label(label):
     label = label.replace("Image 1", "").replace("Image 2", "").replace("Image 3", "").replace("Image 4", "")
@@ -236,28 +211,15 @@ def encode_to_latents(images, texts):
         # Кодируем батч
         with torch.no_grad():
             posteriors = vae.encode(batch_tensor).latent_dist.mode()
             latents = (posteriors - shift_factor) / scaling_factor
-            if latents_mean!=None and latents_std!=None:
-                latents = (latents - torch.tensor(latents_mean, device=device, dtype=dtype).view(1, -1, 1, 1, 1)) / torch.tensor(latents_std, device=device, dtype=dtype).view(1, -1, 1, 1, 1)
-            #print(latents.ndim, latents.shape)
-            if latents.ndim==5:
-                latents = latents[:, :, 0, :, :]  # Убираем временную ось [B, C, H, W]
         latents_np = latents.to(dtype).cpu().numpy()
         # Обрабатываем тексты
         text_labels = [clean_label(text) for text in texts]
-        if random.random() < img_share:
-            embeddings = encode_images_batch(pil_images, processor, model)
-            text_labels = [f"img: {label}" for label in text_labels]
-        else:
-            model_prompts, text_labels = process_labels_for_guidance(text_labels, empty_share)
-            if textemb_full:
-                embeddings = encode_texts_batch_full(model_prompts, tokenizer, model)
-            else:
-                embeddings = encode_texts_batch(model_prompts, tokenizer, model)
         return {
             "vae": latents_np,

 min_size = 192 #256 #192
 max_size = 384 #256 #384
 step = 64
 empty_share = 0.05
 limit = 0
 # Основная процедура обработки
 folder_path = "/workspace/butterfly" #alchemist"
 save_path = "/workspace/sdxs3d/datasets/butterfly" #"alchemist"
 # ---------------- 2️⃣ Загрузка моделей ----------------
 def load_models():  # Load the VAE, the Qwen3-Embedding model and its tokenizer; returns the tuple (vae, model, tokenizer).
     print("Загрузка моделей...")  # progress message (Russian for "Loading models...")
+    vae = AutoencoderKL.from_pretrained("AiArtLab/simplevae",subfolder="simple_vae_nightly",torch_dtype=dtype).to(device).eval()  # VAE loaded in `dtype`, moved to `device`, switched to eval mode
+    tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-Embedding-0.6B', padding_side='left')  # left padding: pad tokens go in front, so the final position of each row holds a real token
+    model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B').to("cuda")  # NOTE(review): hard-codes "cuda" instead of `device`, and sets neither .eval() nor a dtype, unlike the VAE above - confirm this is intended
+    return vae, model, tokenizer
+vae, model, tokenizer = load_models()
 shift_factor = getattr(vae.config, "shift_factor", 0.0)
 if shift_factor is None:
     return transform
 # ---------------- 4️⃣ Функции обработки ----------------
+def last_token_pool(last_hidden_states: torch.Tensor,
+                    attention_mask: torch.Tensor) -> torch.Tensor:
+    # Pool one hidden state per row. Detect left padding first: the last mask column sums to the batch size only when every row's final position is unmasked.
+    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
+    if left_padding:
+        return last_hidden_states[:, -1]  # every row ends on a real token, so take the final position directly
+    else:
+        sequence_lengths = attention_mask.sum(dim=1) - 1  # per-row mask sum minus one; used below as the index of the token to pool (assumes right padding with a contiguous mask - TODO confirm)
+        batch_size = last_hidden_states.shape[0]
+        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]  # pick one position per row via paired row/position indices
+def encode_texts_batch(texts, tokenizer, model, device="cuda", max_length=512, normalize=False):  # Encode `texts` into one pooled embedding per text and return it as a NumPy array.
     with torch.inference_mode():
+        # Tokenization
+        batch = tokenizer(
+            texts,
+            return_tensors="pt",
+            padding="max_length",  # every row is padded out to max_length
+            truncation=True,  # longer inputs are cut to max_length
+            max_length=max_length
+        ).to(device)
+        # Forward pass through the model
+        outputs = model(**batch)
+        # Last-token pooling
+        embeddings = last_token_pool(outputs.last_hidden_state, batch["attention_mask"])
+        # L2 normalization (optional; usually needed for semantic search)
+        if normalize:
+            embeddings = F.normalize(embeddings, p=2, dim=1)  # NOTE(review): `F` is not defined in the visible hunks - presumably torch.nn.functional; confirm it is imported
+    return embeddings.unsqueeze(1).cpu().numpy()  # insert a singleton axis at dim 1, move to CPU, convert to NumPy (presumably shape (batch, 1, hidden) - TODO confirm)
 def clean_label(label):
     label = label.replace("Image 1", "").replace("Image 2", "").replace("Image 3", "").replace("Image 4", "")
         # Кодируем батч
         with torch.no_grad():
             posteriors = vae.encode(batch_tensor).latent_dist.mode()
             latents = (posteriors - shift_factor) / scaling_factor
         latents_np = latents.to(dtype).cpu().numpy()
         # Обрабатываем тексты
         text_labels = [clean_label(text) for text in texts]
+        model_prompts, text_labels = process_labels_for_guidance(text_labels, empty_share)
+        embeddings = encode_texts_batch(model_prompts, tokenizer, model)
         return {
             "vae": latents_np,

src/dataset_sample.ipynb CHANGED Viewed

@@ -202,12 +202,8 @@
     "    \n",
     "    # Загрузка VAE модели\n",
     "    print(\"Загрузка VAE модели...\")\n",
-    "    #vae = AutoencoderKLWan.from_pretrained(\n",
-    "    #    \"AiArtLab/simplevae\", subfolder=\"wan16x_vae_nightly\",\n",
-    "    #    torch_dtype=dtype\n",
-    "    #).to(device).eval()\n",
-    "    vae = AutoencoderKL.from_pretrained(\"madebyollin/sdxl-vae-fp16-fix\", subfolder=None,torch_dtype=dtype).to(device).eval()\n",
-    "\n",
     "    shift_factor = getattr(vae.config, \"shift_factor\", 0.0)\n",
     "    if shift_factor is None:\n",
     "        shift_factor = 0.0\n",
@@ -248,8 +244,6 @@
     "        print(f\"\\n--- Батч {width}x{height}: {count} примеров ---\")\n",
     "        \n",
     "        latent = torch.tensor(example[\"vae\"], dtype=dtype).to(device)\n",
-    "        #if latent.ndim == 3:\n",
-    "        #    latent = latent.unsqueeze(1)\n",
     "        # Латент  в форме [C, T, H, W]\n",
     "        print(latent.ndim, latent.shape)\n",
     "        with torch.no_grad():\n",
@@ -331,7 +325,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.10"
   }
  },
  "nbformat": 4,

     "    \n",
     "    # Загрузка VAE модели\n",
     "    print(\"Загрузка VAE модели...\")\n",
+    "    vae = AutoencoderKL.from_pretrained(\"AiArtLab/simplevae\",subfolder=\"simple_vae_nightly\",torch_dtype=dtype).to(device).eval()\n",
+    "    \n",
     "    shift_factor = getattr(vae.config, \"shift_factor\", 0.0)\n",
     "    if shift_factor is None:\n",
     "        shift_factor = 0.0\n",
     "        print(f\"\\n--- Батч {width}x{height}: {count} примеров ---\")\n",
     "        \n",
     "        latent = torch.tensor(example[\"vae\"], dtype=dtype).to(device)\n",
     "        # Латент  в форме [C, T, H, W]\n",
     "        print(latent.ndim, latent.shape)\n",
     "        with torch.no_grad():\n",
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
+   "version": "3.11.11"
   }
  },
  "nbformat": 4,

src/model_create.ipynb CHANGED Viewed

@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 2,
    "id": "5212f806-14b4-4b5f-bcb4-09e36df3b7d9",
    "metadata": {},
    "outputs": [
@@ -11,164 +11,223 @@
      "output_type": "stream",
      "text": [
       "test unet\n",
-      "Количество параметров: 1616742724\n",
-      "Output shape: torch.Size([1, 4, 60, 48])\n",
       "UNet2DConditionModel(\n",
-      "  (conv_in): Conv2d(4, 288, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "  (time_proj): Timesteps()\n",
       "  (time_embedding): TimestepEmbedding(\n",
-      "    (linear_1): Linear(in_features=288, out_features=1152, bias=True)\n",
       "    (act): SiLU()\n",
-      "    (linear_2): Linear(in_features=1152, out_features=1152, bias=True)\n",
       "  )\n",
       "  (down_blocks): ModuleList(\n",
       "    (0): DownBlock2D(\n",
       "      (resnets): ModuleList(\n",
       "        (0-1): 2 x ResnetBlock2D(\n",
-      "          (norm1): GroupNorm(32, 288, eps=1e-05, affine=True)\n",
-      "          (conv1): Conv2d(288, 288, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
-      "          (time_emb_proj): Linear(in_features=1152, out_features=288, bias=True)\n",
-      "          (norm2): GroupNorm(32, 288, eps=1e-05, affine=True)\n",
       "          (dropout): Dropout(p=0.0, inplace=False)\n",
-      "          (conv2): Conv2d(288, 288, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "          (nonlinearity): SiLU()\n",
       "        )\n",
       "      )\n",
       "      (downsamplers): ModuleList(\n",
       "        (0): Downsample2D(\n",
-      "          (conv): Conv2d(288, 288, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "    (1): CrossAttnDownBlock2D(\n",
       "      (attentions): ModuleList(\n",
       "        (0-1): 2 x Transformer2DModel(\n",
-      "          (norm): GroupNorm(32, 576, eps=1e-06, affine=True)\n",
-      "          (proj_in): Linear(in_features=576, out_features=576, bias=True)\n",
       "          (transformer_blocks): ModuleList(\n",
       "            (0): BasicTransformerBlock(\n",
-      "              (norm1): LayerNorm((576,), eps=1e-05, elementwise_affine=True)\n",
       "              (attn1): Attention(\n",
-      "                (to_q): Linear(in_features=576, out_features=576, bias=False)\n",
-      "                (to_k): Linear(in_features=576, out_features=576, bias=False)\n",
-      "                (to_v): Linear(in_features=576, out_features=576, bias=False)\n",
       "                (to_out): ModuleList(\n",
-      "                  (0): Linear(in_features=576, out_features=576, bias=True)\n",
       "                  (1): Dropout(p=0.0, inplace=False)\n",
       "                )\n",
       "              )\n",
-      "              (norm2): LayerNorm((576,), eps=1e-05, elementwise_affine=True)\n",
       "              (attn2): Attention(\n",
-      "                (to_q): Linear(in_features=576, out_features=576, bias=False)\n",
-      "                (to_k): Linear(in_features=1152, out_features=576, bias=False)\n",
-      "                (to_v): Linear(in_features=1152, out_features=576, bias=False)\n",
       "                (to_out): ModuleList(\n",
-      "                  (0): Linear(in_features=576, out_features=576, bias=True)\n",
       "                  (1): Dropout(p=0.0, inplace=False)\n",
       "                )\n",
       "              )\n",
-      "              (norm3): LayerNorm((576,), eps=1e-05, elementwise_affine=True)\n",
       "              (ff): FeedForward(\n",
       "                (net): ModuleList(\n",
       "                  (0): GEGLU(\n",
-      "                    (proj): Linear(in_features=576, out_features=4608, bias=True)\n",
       "                  )\n",
       "                  (1): Dropout(p=0.0, inplace=False)\n",
-      "                  (2): Linear(in_features=2304, out_features=576, bias=True)\n",
       "                )\n",
       "              )\n",
       "            )\n",
       "          )\n",
-      "          (proj_out): Linear(in_features=576, out_features=576, bias=True)\n",
       "        )\n",
       "      )\n",
       "      (resnets): ModuleList(\n",
       "        (0): ResnetBlock2D(\n",
-      "          (norm1): GroupNorm(32, 288, eps=1e-05, affine=True)\n",
-      "          (conv1): Conv2d(288, 576, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
-      "          (time_emb_proj): Linear(in_features=1152, out_features=576, bias=True)\n",
-      "          (norm2): GroupNorm(32, 576, eps=1e-05, affine=True)\n",
       "          (dropout): Dropout(p=0.0, inplace=False)\n",
-      "          (conv2): Conv2d(576, 576, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "          (nonlinearity): SiLU()\n",
-      "          (conv_shortcut): Conv2d(288, 576, kernel_size=(1, 1), stride=(1, 1))\n",
       "        )\n",
       "        (1): ResnetBlock2D(\n",
-      "          (norm1): GroupNorm(32, 576, eps=1e-05, affine=True)\n",
-      "          (conv1): Conv2d(576, 576, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
-      "          (time_emb_proj): Linear(in_features=1152, out_features=576, bias=True)\n",
-      "          (norm2): GroupNorm(32, 576, eps=1e-05, affine=True)\n",
       "          (dropout): Dropout(p=0.0, inplace=False)\n",
-      "          (conv2): Conv2d(576, 576, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "          (nonlinearity): SiLU()\n",
       "        )\n",
       "      )\n",
       "      (downsamplers): ModuleList(\n",
       "        (0): Downsample2D(\n",
-      "          (conv): Conv2d(576, 576, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "    (2): CrossAttnDownBlock2D(\n",
       "      (attentions): ModuleList(\n",
       "        (0-1): 2 x Transformer2DModel(\n",
-      "          (norm): GroupNorm(32, 1152, eps=1e-06, affine=True)\n",
-      "          (proj_in): Linear(in_features=1152, out_features=1152, bias=True)\n",
       "          (transformer_blocks): ModuleList(\n",
-      "            (0-7): 8 x BasicTransformerBlock(\n",
-      "              (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n",
       "              (attn1): Attention(\n",
-      "                (to_q): Linear(in_features=1152, out_features=1152, bias=False)\n",
-      "                (to_k): Linear(in_features=1152, out_features=1152, bias=False)\n",
-      "                (to_v): Linear(in_features=1152, out_features=1152, bias=False)\n",
       "                (to_out): ModuleList(\n",
-      "                  (0): Linear(in_features=1152, out_features=1152, bias=True)\n",
       "                  (1): Dropout(p=0.0, inplace=False)\n",
       "                )\n",
       "              )\n",
-      "              (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n",
       "              (attn2): Attention(\n",
-      "                (to_q): Linear(in_features=1152, out_features=1152, bias=False)\n",
-      "                (to_k): Linear(in_features=1152, out_features=1152, bias=False)\n",
-      "                (to_v): Linear(in_features=1152, out_features=1152, bias=False)\n",
       "                (to_out): ModuleList(\n",
-      "                  (0): Linear(in_features=1152, out_features=1152, bias=True)\n",
       "                  (1): Dropout(p=0.0, inplace=False)\n",
       "                )\n",
       "              )\n",
-      "              (norm3): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n",
       "              (ff): FeedForward(\n",
       "                (net): ModuleList(\n",
       "                  (0): GEGLU(\n",
-      "                    (proj): Linear(in_features=1152, out_features=9216, bias=True)\n",
       "                  )\n",
       "                  (1): Dropout(p=0.0, inplace=False)\n",
-      "                  (2): Linear(in_features=4608, out_features=1152, bias=True)\n",
       "                )\n",
       "              )\n",
       "            )\n",
       "          )\n",
-      "          (proj_out): Linear(in_features=1152, out_features=1152, bias=True)\n",
       "        )\n",
       "      )\n",
       "      (resnets): ModuleList(\n",
       "        (0): ResnetBlock2D(\n",
-      "          (norm1): GroupNorm(32, 576, eps=1e-05, affine=True)\n",
-      "          (conv1): Conv2d(576, 1152, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
-      "          (time_emb_proj): Linear(in_features=1152, out_features=1152, bias=True)\n",
-      "          (norm2): GroupNorm(32, 1152, eps=1e-05, affine=True)\n",
       "          (dropout): Dropout(p=0.0, inplace=False)\n",
-      "          (conv2): Conv2d(1152, 1152, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "          (nonlinearity): SiLU()\n",
-      "          (conv_shortcut): Conv2d(576, 1152, kernel_size=(1, 1), stride=(1, 1))\n",
       "        )\n",
       "        (1): ResnetBlock2D(\n",
-      "          (norm1): GroupNorm(32, 1152, eps=1e-05, affine=True)\n",
-      "          (conv1): Conv2d(1152, 1152, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
-      "          (time_emb_proj): Linear(in_features=1152, out_features=1152, bias=True)\n",
-      "          (norm2): GroupNorm(32, 1152, eps=1e-05, affine=True)\n",
       "          (dropout): Dropout(p=0.0, inplace=False)\n",
-      "          (conv2): Conv2d(1152, 1152, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "          (nonlinearity): SiLU()\n",
       "        )\n",
       "      )\n",
@@ -178,174 +237,234 @@
       "    (0): CrossAttnUpBlock2D(\n",
       "      (attentions): ModuleList(\n",
       "        (0-2): 3 x Transformer2DModel(\n",
-      "          (norm): GroupNorm(32, 1152, eps=1e-06, affine=True)\n",
-      "          (proj_in): Linear(in_features=1152, out_features=1152, bias=True)\n",
       "          (transformer_blocks): ModuleList(\n",
       "            (0-7): 8 x BasicTransformerBlock(\n",
-      "              (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n",
       "              (attn1): Attention(\n",
-      "                (to_q): Linear(in_features=1152, out_features=1152, bias=False)\n",
-      "                (to_k): Linear(in_features=1152, out_features=1152, bias=False)\n",
-      "                (to_v): Linear(in_features=1152, out_features=1152, bias=False)\n",
       "                (to_out): ModuleList(\n",
-      "                  (0): Linear(in_features=1152, out_features=1152, bias=True)\n",
       "                  (1): Dropout(p=0.0, inplace=False)\n",
       "                )\n",
       "              )\n",
-      "              (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n",
       "              (attn2): Attention(\n",
-      "                (to_q): Linear(in_features=1152, out_features=1152, bias=False)\n",
-      "                (to_k): Linear(in_features=1152, out_features=1152, bias=False)\n",
-      "                (to_v): Linear(in_features=1152, out_features=1152, bias=False)\n",
       "                (to_out): ModuleList(\n",
-      "                  (0): Linear(in_features=1152, out_features=1152, bias=True)\n",
       "                  (1): Dropout(p=0.0, inplace=False)\n",
       "                )\n",
       "              )\n",
-      "              (norm3): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n",
       "              (ff): FeedForward(\n",
       "                (net): ModuleList(\n",
       "                  (0): GEGLU(\n",
-      "                    (proj): Linear(in_features=1152, out_features=9216, bias=True)\n",
       "                  )\n",
       "                  (1): Dropout(p=0.0, inplace=False)\n",
-      "                  (2): Linear(in_features=4608, out_features=1152, bias=True)\n",
       "                )\n",
       "              )\n",
       "            )\n",
       "          )\n",
-      "          (proj_out): Linear(in_features=1152, out_features=1152, bias=True)\n",
       "        )\n",
       "      )\n",
       "      (resnets): ModuleList(\n",
       "        (0-1): 2 x ResnetBlock2D(\n",
-      "          (norm1): GroupNorm(32, 2304, eps=1e-05, affine=True)\n",
-      "          (conv1): Conv2d(2304, 1152, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
-      "          (time_emb_proj): Linear(in_features=1152, out_features=1152, bias=True)\n",
-      "          (norm2): GroupNorm(32, 1152, eps=1e-05, affine=True)\n",
       "          (dropout): Dropout(p=0.0, inplace=False)\n",
-      "          (conv2): Conv2d(1152, 1152, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "          (nonlinearity): SiLU()\n",
-      "          (conv_shortcut): Conv2d(2304, 1152, kernel_size=(1, 1), stride=(1, 1))\n",
       "        )\n",
       "        (2): ResnetBlock2D(\n",
-      "          (norm1): GroupNorm(32, 1728, eps=1e-05, affine=True)\n",
-      "          (conv1): Conv2d(1728, 1152, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
-      "          (time_emb_proj): Linear(in_features=1152, out_features=1152, bias=True)\n",
-      "          (norm2): GroupNorm(32, 1152, eps=1e-05, affine=True)\n",
       "          (dropout): Dropout(p=0.0, inplace=False)\n",
-      "          (conv2): Conv2d(1152, 1152, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "          (nonlinearity): SiLU()\n",
-      "          (conv_shortcut): Conv2d(1728, 1152, kernel_size=(1, 1), stride=(1, 1))\n",
       "        )\n",
       "      )\n",
       "      (upsamplers): ModuleList(\n",
       "        (0): Upsample2D(\n",
-      "          (conv): Conv2d(1152, 1152, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "        )\n",
       "      )\n",
       "    )\n",
-      "    (1): CrossAttnUpBlock2D(\n",
       "      (attentions): ModuleList(\n",
       "        (0-2): 3 x Transformer2DModel(\n",
-      "          (norm): GroupNorm(32, 576, eps=1e-06, affine=True)\n",
-      "          (proj_in): Linear(in_features=576, out_features=576, bias=True)\n",
       "          (transformer_blocks): ModuleList(\n",
       "            (0): BasicTransformerBlock(\n",
-      "              (norm1): LayerNorm((576,), eps=1e-05, elementwise_affine=True)\n",
       "              (attn1): Attention(\n",
-      "                (to_q): Linear(in_features=576, out_features=576, bias=False)\n",
-      "                (to_k): Linear(in_features=576, out_features=576, bias=False)\n",
-      "                (to_v): Linear(in_features=576, out_features=576, bias=False)\n",
       "                (to_out): ModuleList(\n",
-      "                  (0): Linear(in_features=576, out_features=576, bias=True)\n",
       "                  (1): Dropout(p=0.0, inplace=False)\n",
       "                )\n",
       "              )\n",
-      "              (norm2): LayerNorm((576,), eps=1e-05, elementwise_affine=True)\n",
       "              (attn2): Attention(\n",
-      "                (to_q): Linear(in_features=576, out_features=576, bias=False)\n",
-      "                (to_k): Linear(in_features=1152, out_features=576, bias=False)\n",
-      "                (to_v): Linear(in_features=1152, out_features=576, bias=False)\n",
       "                (to_out): ModuleList(\n",
-      "                  (0): Linear(in_features=576, out_features=576, bias=True)\n",
       "                  (1): Dropout(p=0.0, inplace=False)\n",
       "                )\n",
       "              )\n",
-      "              (norm3): LayerNorm((576,), eps=1e-05, elementwise_affine=True)\n",
       "              (ff): FeedForward(\n",
       "                (net): ModuleList(\n",
       "                  (0): GEGLU(\n",
-      "                    (proj): Linear(in_features=576, out_features=4608, bias=True)\n",
       "                  )\n",
       "                  (1): Dropout(p=0.0, inplace=False)\n",
-      "                  (2): Linear(in_features=2304, out_features=576, bias=True)\n",
       "                )\n",
       "              )\n",
       "            )\n",
       "          )\n",
-      "          (proj_out): Linear(in_features=576, out_features=576, bias=True)\n",
       "        )\n",
       "      )\n",
       "      (resnets): ModuleList(\n",
       "        (0): ResnetBlock2D(\n",
-      "          (norm1): GroupNorm(32, 1728, eps=1e-05, affine=True)\n",
-      "          (conv1): Conv2d(1728, 576, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
-      "          (time_emb_proj): Linear(in_features=1152, out_features=576, bias=True)\n",
-      "          (norm2): GroupNorm(32, 576, eps=1e-05, affine=True)\n",
       "          (dropout): Dropout(p=0.0, inplace=False)\n",
-      "          (conv2): Conv2d(576, 576, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "          (nonlinearity): SiLU()\n",
-      "          (conv_shortcut): Conv2d(1728, 576, kernel_size=(1, 1), stride=(1, 1))\n",
       "        )\n",
       "        (1): ResnetBlock2D(\n",
-      "          (norm1): GroupNorm(32, 1152, eps=1e-05, affine=True)\n",
-      "          (conv1): Conv2d(1152, 576, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
-      "          (time_emb_proj): Linear(in_features=1152, out_features=576, bias=True)\n",
-      "          (norm2): GroupNorm(32, 576, eps=1e-05, affine=True)\n",
       "          (dropout): Dropout(p=0.0, inplace=False)\n",
-      "          (conv2): Conv2d(576, 576, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "          (nonlinearity): SiLU()\n",
-      "          (conv_shortcut): Conv2d(1152, 576, kernel_size=(1, 1), stride=(1, 1))\n",
       "        )\n",
       "        (2): ResnetBlock2D(\n",
-      "          (norm1): GroupNorm(32, 864, eps=1e-05, affine=True)\n",
-      "          (conv1): Conv2d(864, 576, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
-      "          (time_emb_proj): Linear(in_features=1152, out_features=576, bias=True)\n",
-      "          (norm2): GroupNorm(32, 576, eps=1e-05, affine=True)\n",
       "          (dropout): Dropout(p=0.0, inplace=False)\n",
-      "          (conv2): Conv2d(576, 576, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "          (nonlinearity): SiLU()\n",
-      "          (conv_shortcut): Conv2d(864, 576, kernel_size=(1, 1), stride=(1, 1))\n",
       "        )\n",
       "      )\n",
       "      (upsamplers): ModuleList(\n",
       "        (0): Upsample2D(\n",
-      "          (conv): Conv2d(576, 576, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "        )\n",
       "      )\n",
       "    )\n",
-      "    (2): UpBlock2D(\n",
       "      (resnets): ModuleList(\n",
       "        (0): ResnetBlock2D(\n",
-      "          (norm1): GroupNorm(32, 864, eps=1e-05, affine=True)\n",
-      "          (conv1): Conv2d(864, 288, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
-      "          (time_emb_proj): Linear(in_features=1152, out_features=288, bias=True)\n",
-      "          (norm2): GroupNorm(32, 288, eps=1e-05, affine=True)\n",
       "          (dropout): Dropout(p=0.0, inplace=False)\n",
-      "          (conv2): Conv2d(288, 288, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "          (nonlinearity): SiLU()\n",
-      "          (conv_shortcut): Conv2d(864, 288, kernel_size=(1, 1), stride=(1, 1))\n",
       "        )\n",
       "        (1-2): 2 x ResnetBlock2D(\n",
-      "          (norm1): GroupNorm(32, 576, eps=1e-05, affine=True)\n",
-      "          (conv1): Conv2d(576, 288, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
-      "          (time_emb_proj): Linear(in_features=1152, out_features=288, bias=True)\n",
-      "          (norm2): GroupNorm(32, 288, eps=1e-05, affine=True)\n",
       "          (dropout): Dropout(p=0.0, inplace=False)\n",
-      "          (conv2): Conv2d(288, 288, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "          (nonlinearity): SiLU()\n",
-      "          (conv_shortcut): Conv2d(576, 288, kernel_size=(1, 1), stride=(1, 1))\n",
       "        )\n",
       "      )\n",
       "    )\n",
@@ -353,60 +472,60 @@
       "  (mid_block): UNetMidBlock2DCrossAttn(\n",
       "    (attentions): ModuleList(\n",
       "      (0): Transformer2DModel(\n",
-      "        (norm): GroupNorm(32, 1152, eps=1e-06, affine=True)\n",
-      "        (proj_in): Linear(in_features=1152, out_features=1152, bias=True)\n",
       "        (transformer_blocks): ModuleList(\n",
       "          (0-7): 8 x BasicTransformerBlock(\n",
-      "            (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n",
       "            (attn1): Attention(\n",
-      "              (to_q): Linear(in_features=1152, out_features=1152, bias=False)\n",
-      "              (to_k): Linear(in_features=1152, out_features=1152, bias=False)\n",
-      "              (to_v): Linear(in_features=1152, out_features=1152, bias=False)\n",
       "              (to_out): ModuleList(\n",
-      "                (0): Linear(in_features=1152, out_features=1152, bias=True)\n",
       "                (1): Dropout(p=0.0, inplace=False)\n",
       "              )\n",
       "            )\n",
-      "            (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n",
       "            (attn2): Attention(\n",
-      "              (to_q): Linear(in_features=1152, out_features=1152, bias=False)\n",
-      "              (to_k): Linear(in_features=1152, out_features=1152, bias=False)\n",
-      "              (to_v): Linear(in_features=1152, out_features=1152, bias=False)\n",
       "              (to_out): ModuleList(\n",
-      "                (0): Linear(in_features=1152, out_features=1152, bias=True)\n",
       "                (1): Dropout(p=0.0, inplace=False)\n",
       "              )\n",
       "            )\n",
-      "            (norm3): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n",
       "            (ff): FeedForward(\n",
       "              (net): ModuleList(\n",
       "                (0): GEGLU(\n",
-      "                  (proj): Linear(in_features=1152, out_features=9216, bias=True)\n",
       "                )\n",
       "                (1): Dropout(p=0.0, inplace=False)\n",
-      "                (2): Linear(in_features=4608, out_features=1152, bias=True)\n",
       "              )\n",
       "            )\n",
       "          )\n",
       "        )\n",
-      "        (proj_out): Linear(in_features=1152, out_features=1152, bias=True)\n",
       "      )\n",
       "    )\n",
       "    (resnets): ModuleList(\n",
       "      (0-1): 2 x ResnetBlock2D(\n",
-      "        (norm1): GroupNorm(32, 1152, eps=1e-05, affine=True)\n",
-      "        (conv1): Conv2d(1152, 1152, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
-      "        (time_emb_proj): Linear(in_features=1152, out_features=1152, bias=True)\n",
-      "        (norm2): GroupNorm(32, 1152, eps=1e-05, affine=True)\n",
       "        (dropout): Dropout(p=0.0, inplace=False)\n",
-      "        (conv2): Conv2d(1152, 1152, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "        (nonlinearity): SiLU()\n",
       "      )\n",
       "    )\n",
       "  )\n",
-      "  (conv_norm_out): GroupNorm(32, 288, eps=1e-05, affine=True)\n",
       "  (conv_act): SiLU()\n",
-      "  (conv_out): Conv2d(288, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       ")\n"
      ]
     }
@@ -414,11 +533,11 @@
    "source": [
     "config_sdxs = {\n",
     "    # === Основные размеры и каналы ===\n",
-    "    \"in_channels\": 4,               # Количество входных каналов (совместимость с VAE)\n",
-    "    \"out_channels\": 4,              # Количество выходных каналов (симметрично in_channels)          \n",
     "\n",
     "    # === Cross-Attention ===\n",
-    "    \"cross_attention_dim\": 1152,      # Размерность текстовых эмбеддингов\n",
     "    \"use_linear_projection\": True,\n",
     "    \"norm_num_groups\": 32,\n",
     "    \n",
@@ -427,20 +546,20 @@
     "        \"DownBlock2D\",\n",
     "        \"CrossAttnDownBlock2D\",\n",
     "        \"CrossAttnDownBlock2D\",\n",
-    "        #\"CrossAttnDownBlock2D\",\n",
     "    ],\n",
     "    \"up_block_types\": [   # декодер\n",
-    "        #\"CrossAttnUpBlock2D\",\n",
     "        \"CrossAttnUpBlock2D\",\n",
     "        \"CrossAttnUpBlock2D\",\n",
     "        \"UpBlock2D\",\n",
     "    ],\n",
     "\n",
     "    # === Конфигурация каналов ===\n",
-    "    \"block_out_channels\": [288, 576, 1152],\n",
     "\n",
-    "    \"transformer_layers_per_block\": [1, 1, 8],\n",
-    "    \"attention_head_dim\": [6, 9, 18],\n",
     "}\n",
     "\n",
     "def check_initialization(model):\n",
@@ -465,9 +584,9 @@
     "    print(f\"Количество параметров: {num_params}\")\n",
     "\n",
     "    # Генерация тестового латента (640x512 в latent space)\n",
-    "    test_latent = torch.randn(1,4, 60, 48).to(\"cuda\", dtype=torch.float16)  # 60x48 ≈ 512px\n",
     "    timesteps = torch.tensor([1]).to(\"cuda\", dtype=torch.float16)\n",
-    "    encoder_hidden_states = torch.randn(1, 77, 1152).to(\"cuda\", dtype=torch.float16)\n",
     "    \n",
     "    with torch.no_grad():\n",
     "        output = new_unet(\n",
@@ -506,7 +625,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.10"
   }
  },
  "nbformat": 4,

  "cells": [
   {
    "cell_type": "code",
+   "execution_count": 1,
    "id": "5212f806-14b4-4b5f-bcb4-09e36df3b7d9",
    "metadata": {},
    "outputs": [
      "output_type": "stream",
      "text": [
       "test unet\n",
+      "Количество параметров: 1546186256\n",
+      "Output shape: torch.Size([1, 16, 60, 48])\n",
       "UNet2DConditionModel(\n",
+      "  (conv_in): Conv2d(16, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "  (time_proj): Timesteps()\n",
       "  (time_embedding): TimestepEmbedding(\n",
+      "    (linear_1): Linear(in_features=256, out_features=1024, bias=True)\n",
       "    (act): SiLU()\n",
+      "    (linear_2): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "  )\n",
       "  (down_blocks): ModuleList(\n",
       "    (0): DownBlock2D(\n",
       "      (resnets): ModuleList(\n",
       "        (0-1): 2 x ResnetBlock2D(\n",
+      "          (norm1): GroupNorm(32, 256, eps=1e-05, affine=True)\n",
+      "          (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "          (time_emb_proj): Linear(in_features=1024, out_features=256, bias=True)\n",
+      "          (norm2): GroupNorm(32, 256, eps=1e-05, affine=True)\n",
       "          (dropout): Dropout(p=0.0, inplace=False)\n",
+      "          (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "          (nonlinearity): SiLU()\n",
       "        )\n",
       "      )\n",
       "      (downsamplers): ModuleList(\n",
       "        (0): Downsample2D(\n",
+      "          (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "    (1): CrossAttnDownBlock2D(\n",
       "      (attentions): ModuleList(\n",
       "        (0-1): 2 x Transformer2DModel(\n",
+      "          (norm): GroupNorm(32, 512, eps=1e-06, affine=True)\n",
+      "          (proj_in): Linear(in_features=512, out_features=512, bias=True)\n",
       "          (transformer_blocks): ModuleList(\n",
       "            (0): BasicTransformerBlock(\n",
+      "              (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n",
       "              (attn1): Attention(\n",
+      "                (to_q): Linear(in_features=512, out_features=512, bias=False)\n",
+      "                (to_k): Linear(in_features=512, out_features=512, bias=False)\n",
+      "                (to_v): Linear(in_features=512, out_features=512, bias=False)\n",
       "                (to_out): ModuleList(\n",
+      "                  (0): Linear(in_features=512, out_features=512, bias=True)\n",
       "                  (1): Dropout(p=0.0, inplace=False)\n",
       "                )\n",
       "              )\n",
+      "              (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n",
       "              (attn2): Attention(\n",
+      "                (to_q): Linear(in_features=512, out_features=512, bias=False)\n",
+      "                (to_k): Linear(in_features=1024, out_features=512, bias=False)\n",
+      "                (to_v): Linear(in_features=1024, out_features=512, bias=False)\n",
       "                (to_out): ModuleList(\n",
+      "                  (0): Linear(in_features=512, out_features=512, bias=True)\n",
       "                  (1): Dropout(p=0.0, inplace=False)\n",
       "                )\n",
       "              )\n",
+      "              (norm3): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n",
       "              (ff): FeedForward(\n",
       "                (net): ModuleList(\n",
       "                  (0): GEGLU(\n",
+      "                    (proj): Linear(in_features=512, out_features=4096, bias=True)\n",
       "                  )\n",
       "                  (1): Dropout(p=0.0, inplace=False)\n",
+      "                  (2): Linear(in_features=2048, out_features=512, bias=True)\n",
       "                )\n",
       "              )\n",
       "            )\n",
       "          )\n",
+      "          (proj_out): Linear(in_features=512, out_features=512, bias=True)\n",
       "        )\n",
       "      )\n",
       "      (resnets): ModuleList(\n",
       "        (0): ResnetBlock2D(\n",
+      "          (norm1): GroupNorm(32, 256, eps=1e-05, affine=True)\n",
+      "          (conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "          (time_emb_proj): Linear(in_features=1024, out_features=512, bias=True)\n",
+      "          (norm2): GroupNorm(32, 512, eps=1e-05, affine=True)\n",
       "          (dropout): Dropout(p=0.0, inplace=False)\n",
+      "          (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "          (nonlinearity): SiLU()\n",
+      "          (conv_shortcut): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1))\n",
       "        )\n",
       "        (1): ResnetBlock2D(\n",
+      "          (norm1): GroupNorm(32, 512, eps=1e-05, affine=True)\n",
+      "          (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "          (time_emb_proj): Linear(in_features=1024, out_features=512, bias=True)\n",
+      "          (norm2): GroupNorm(32, 512, eps=1e-05, affine=True)\n",
       "          (dropout): Dropout(p=0.0, inplace=False)\n",
+      "          (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "          (nonlinearity): SiLU()\n",
       "        )\n",
       "      )\n",
       "      (downsamplers): ModuleList(\n",
       "        (0): Downsample2D(\n",
+      "          (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "    (2): CrossAttnDownBlock2D(\n",
       "      (attentions): ModuleList(\n",
       "        (0-1): 2 x Transformer2DModel(\n",
+      "          (norm): GroupNorm(32, 1024, eps=1e-06, affine=True)\n",
+      "          (proj_in): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "          (transformer_blocks): ModuleList(\n",
+      "            (0): BasicTransformerBlock(\n",
+      "              (norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
       "              (attn1): Attention(\n",
+      "                (to_q): Linear(in_features=1024, out_features=1024, bias=False)\n",
+      "                (to_k): Linear(in_features=1024, out_features=1024, bias=False)\n",
+      "                (to_v): Linear(in_features=1024, out_features=1024, bias=False)\n",
       "                (to_out): ModuleList(\n",
+      "                  (0): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (1): Dropout(p=0.0, inplace=False)\n",
       "                )\n",
       "              )\n",
+      "              (norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
       "              (attn2): Attention(\n",
+      "                (to_q): Linear(in_features=1024, out_features=1024, bias=False)\n",
+      "                (to_k): Linear(in_features=1024, out_features=1024, bias=False)\n",
+      "                (to_v): Linear(in_features=1024, out_features=1024, bias=False)\n",
       "                (to_out): ModuleList(\n",
+      "                  (0): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (1): Dropout(p=0.0, inplace=False)\n",
       "                )\n",
       "              )\n",
+      "              (norm3): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
       "              (ff): FeedForward(\n",
       "                (net): ModuleList(\n",
       "                  (0): GEGLU(\n",
+      "                    (proj): Linear(in_features=1024, out_features=8192, bias=True)\n",
       "                  )\n",
       "                  (1): Dropout(p=0.0, inplace=False)\n",
+      "                  (2): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                )\n",
       "              )\n",
       "            )\n",
       "          )\n",
+      "          (proj_out): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "        )\n",
       "      )\n",
       "      (resnets): ModuleList(\n",
       "        (0): ResnetBlock2D(\n",
+      "          (norm1): GroupNorm(32, 512, eps=1e-05, affine=True)\n",
+      "          (conv1): Conv2d(512, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "          (time_emb_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (norm2): GroupNorm(32, 1024, eps=1e-05, affine=True)\n",
       "          (dropout): Dropout(p=0.0, inplace=False)\n",
+      "          (conv2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "          (nonlinearity): SiLU()\n",
+      "          (conv_shortcut): Conv2d(512, 1024, kernel_size=(1, 1), stride=(1, 1))\n",
       "        )\n",
       "        (1): ResnetBlock2D(\n",
+      "          (norm1): GroupNorm(32, 1024, eps=1e-05, affine=True)\n",
+      "          (conv1): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "          (time_emb_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (norm2): GroupNorm(32, 1024, eps=1e-05, affine=True)\n",
       "          (dropout): Dropout(p=0.0, inplace=False)\n",
+      "          (conv2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "          (nonlinearity): SiLU()\n",
+      "        )\n",
+      "      )\n",
+      "      (downsamplers): ModuleList(\n",
+      "        (0): Downsample2D(\n",
+      "          (conv): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n",
+      "        )\n",
+      "      )\n",
+      "    )\n",
+      "    (3): CrossAttnDownBlock2D(\n",
+      "      (attentions): ModuleList(\n",
+      "        (0-1): 2 x Transformer2DModel(\n",
+      "          (norm): GroupNorm(32, 1024, eps=1e-06, affine=True)\n",
+      "          (proj_in): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (transformer_blocks): ModuleList(\n",
+      "            (0-7): 8 x BasicTransformerBlock(\n",
+      "              (norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
+      "              (attn1): Attention(\n",
+      "                (to_q): Linear(in_features=1024, out_features=1024, bias=False)\n",
+      "                (to_k): Linear(in_features=1024, out_features=1024, bias=False)\n",
+      "                (to_v): Linear(in_features=1024, out_features=1024, bias=False)\n",
+      "                (to_out): ModuleList(\n",
+      "                  (0): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "                  (1): Dropout(p=0.0, inplace=False)\n",
+      "                )\n",
+      "              )\n",
+      "              (norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
+      "              (attn2): Attention(\n",
+      "                (to_q): Linear(in_features=1024, out_features=1024, bias=False)\n",
+      "                (to_k): Linear(in_features=1024, out_features=1024, bias=False)\n",
+      "                (to_v): Linear(in_features=1024, out_features=1024, bias=False)\n",
+      "                (to_out): ModuleList(\n",
+      "                  (0): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "                  (1): Dropout(p=0.0, inplace=False)\n",
+      "                )\n",
+      "              )\n",
+      "              (norm3): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
+      "              (ff): FeedForward(\n",
+      "                (net): ModuleList(\n",
+      "                  (0): GEGLU(\n",
+      "                    (proj): Linear(in_features=1024, out_features=8192, bias=True)\n",
+      "                  )\n",
+      "                  (1): Dropout(p=0.0, inplace=False)\n",
+      "                  (2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "                )\n",
+      "              )\n",
+      "            )\n",
+      "          )\n",
+      "          (proj_out): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "        )\n",
+      "      )\n",
+      "      (resnets): ModuleList(\n",
+      "        (0-1): 2 x ResnetBlock2D(\n",
+      "          (norm1): GroupNorm(32, 1024, eps=1e-05, affine=True)\n",
+      "          (conv1): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "          (time_emb_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (norm2): GroupNorm(32, 1024, eps=1e-05, affine=True)\n",
+      "          (dropout): Dropout(p=0.0, inplace=False)\n",
+      "          (conv2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "          (nonlinearity): SiLU()\n",
       "        )\n",
       "      )\n",
       "    (0): CrossAttnUpBlock2D(\n",
       "      (attentions): ModuleList(\n",
       "        (0-2): 3 x Transformer2DModel(\n",
+      "          (norm): GroupNorm(32, 1024, eps=1e-06, affine=True)\n",
+      "          (proj_in): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "          (transformer_blocks): ModuleList(\n",
       "            (0-7): 8 x BasicTransformerBlock(\n",
+      "              (norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
       "              (attn1): Attention(\n",
+      "                (to_q): Linear(in_features=1024, out_features=1024, bias=False)\n",
+      "                (to_k): Linear(in_features=1024, out_features=1024, bias=False)\n",
+      "                (to_v): Linear(in_features=1024, out_features=1024, bias=False)\n",
       "                (to_out): ModuleList(\n",
+      "                  (0): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (1): Dropout(p=0.0, inplace=False)\n",
       "                )\n",
       "              )\n",
+      "              (norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
       "              (attn2): Attention(\n",
+      "                (to_q): Linear(in_features=1024, out_features=1024, bias=False)\n",
+      "                (to_k): Linear(in_features=1024, out_features=1024, bias=False)\n",
+      "                (to_v): Linear(in_features=1024, out_features=1024, bias=False)\n",
       "                (to_out): ModuleList(\n",
+      "                  (0): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (1): Dropout(p=0.0, inplace=False)\n",
       "                )\n",
       "              )\n",
+      "              (norm3): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
       "              (ff): FeedForward(\n",
       "                (net): ModuleList(\n",
       "                  (0): GEGLU(\n",
+      "                    (proj): Linear(in_features=1024, out_features=8192, bias=True)\n",
       "                  )\n",
       "                  (1): Dropout(p=0.0, inplace=False)\n",
+      "                  (2): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                )\n",
       "              )\n",
       "            )\n",
       "          )\n",
+      "          (proj_out): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "        )\n",
+      "      )\n",
+      "      (resnets): ModuleList(\n",
+      "        (0-2): 3 x ResnetBlock2D(\n",
+      "          (norm1): GroupNorm(32, 2048, eps=1e-05, affine=True)\n",
+      "          (conv1): Conv2d(2048, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "          (time_emb_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (norm2): GroupNorm(32, 1024, eps=1e-05, affine=True)\n",
+      "          (dropout): Dropout(p=0.0, inplace=False)\n",
+      "          (conv2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "          (nonlinearity): SiLU()\n",
+      "          (conv_shortcut): Conv2d(2048, 1024, kernel_size=(1, 1), stride=(1, 1))\n",
+      "        )\n",
+      "      )\n",
+      "      (upsamplers): ModuleList(\n",
+      "        (0): Upsample2D(\n",
+      "          (conv): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "        )\n",
+      "      )\n",
+      "    )\n",
+      "    (1): CrossAttnUpBlock2D(\n",
+      "      (attentions): ModuleList(\n",
+      "        (0-2): 3 x Transformer2DModel(\n",
+      "          (norm): GroupNorm(32, 1024, eps=1e-06, affine=True)\n",
+      "          (proj_in): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (transformer_blocks): ModuleList(\n",
+      "            (0): BasicTransformerBlock(\n",
+      "              (norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
+      "              (attn1): Attention(\n",
+      "                (to_q): Linear(in_features=1024, out_features=1024, bias=False)\n",
+      "                (to_k): Linear(in_features=1024, out_features=1024, bias=False)\n",
+      "                (to_v): Linear(in_features=1024, out_features=1024, bias=False)\n",
+      "                (to_out): ModuleList(\n",
+      "                  (0): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "                  (1): Dropout(p=0.0, inplace=False)\n",
+      "                )\n",
+      "              )\n",
+      "              (norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
+      "              (attn2): Attention(\n",
+      "                (to_q): Linear(in_features=1024, out_features=1024, bias=False)\n",
+      "                (to_k): Linear(in_features=1024, out_features=1024, bias=False)\n",
+      "                (to_v): Linear(in_features=1024, out_features=1024, bias=False)\n",
+      "                (to_out): ModuleList(\n",
+      "                  (0): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "                  (1): Dropout(p=0.0, inplace=False)\n",
+      "                )\n",
+      "              )\n",
+      "              (norm3): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
+      "              (ff): FeedForward(\n",
+      "                (net): ModuleList(\n",
+      "                  (0): GEGLU(\n",
+      "                    (proj): Linear(in_features=1024, out_features=8192, bias=True)\n",
+      "                  )\n",
+      "                  (1): Dropout(p=0.0, inplace=False)\n",
+      "                  (2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "                )\n",
+      "              )\n",
+      "            )\n",
+      "          )\n",
+      "          (proj_out): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "        )\n",
       "      )\n",
       "      (resnets): ModuleList(\n",
       "        (0-1): 2 x ResnetBlock2D(\n",
+      "          (norm1): GroupNorm(32, 2048, eps=1e-05, affine=True)\n",
+      "          (conv1): Conv2d(2048, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "          (time_emb_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (norm2): GroupNorm(32, 1024, eps=1e-05, affine=True)\n",
       "          (dropout): Dropout(p=0.0, inplace=False)\n",
+      "          (conv2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "          (nonlinearity): SiLU()\n",
+      "          (conv_shortcut): Conv2d(2048, 1024, kernel_size=(1, 1), stride=(1, 1))\n",
       "        )\n",
       "        (2): ResnetBlock2D(\n",
+      "          (norm1): GroupNorm(32, 1536, eps=1e-05, affine=True)\n",
+      "          (conv1): Conv2d(1536, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "          (time_emb_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (norm2): GroupNorm(32, 1024, eps=1e-05, affine=True)\n",
       "          (dropout): Dropout(p=0.0, inplace=False)\n",
+      "          (conv2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "          (nonlinearity): SiLU()\n",
+      "          (conv_shortcut): Conv2d(1536, 1024, kernel_size=(1, 1), stride=(1, 1))\n",
       "        )\n",
       "      )\n",
       "      (upsamplers): ModuleList(\n",
       "        (0): Upsample2D(\n",
+      "          (conv): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "        )\n",
       "      )\n",
       "    )\n",
+      "    (2): CrossAttnUpBlock2D(\n",
       "      (attentions): ModuleList(\n",
       "        (0-2): 3 x Transformer2DModel(\n",
+      "          (norm): GroupNorm(32, 512, eps=1e-06, affine=True)\n",
+      "          (proj_in): Linear(in_features=512, out_features=512, bias=True)\n",
       "          (transformer_blocks): ModuleList(\n",
       "            (0): BasicTransformerBlock(\n",
+      "              (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n",
       "              (attn1): Attention(\n",
+      "                (to_q): Linear(in_features=512, out_features=512, bias=False)\n",
+      "                (to_k): Linear(in_features=512, out_features=512, bias=False)\n",
+      "                (to_v): Linear(in_features=512, out_features=512, bias=False)\n",
       "                (to_out): ModuleList(\n",
+      "                  (0): Linear(in_features=512, out_features=512, bias=True)\n",
       "                  (1): Dropout(p=0.0, inplace=False)\n",
       "                )\n",
       "              )\n",
+      "              (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n",
       "              (attn2): Attention(\n",
+      "                (to_q): Linear(in_features=512, out_features=512, bias=False)\n",
+      "                (to_k): Linear(in_features=1024, out_features=512, bias=False)\n",
+      "                (to_v): Linear(in_features=1024, out_features=512, bias=False)\n",
       "                (to_out): ModuleList(\n",
+      "                  (0): Linear(in_features=512, out_features=512, bias=True)\n",
       "                  (1): Dropout(p=0.0, inplace=False)\n",
       "                )\n",
       "              )\n",
+      "              (norm3): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n",
       "              (ff): FeedForward(\n",
       "                (net): ModuleList(\n",
       "                  (0): GEGLU(\n",
+      "                    (proj): Linear(in_features=512, out_features=4096, bias=True)\n",
       "                  )\n",
       "                  (1): Dropout(p=0.0, inplace=False)\n",
+      "                  (2): Linear(in_features=2048, out_features=512, bias=True)\n",
       "                )\n",
       "              )\n",
       "            )\n",
       "          )\n",
+      "          (proj_out): Linear(in_features=512, out_features=512, bias=True)\n",
       "        )\n",
       "      )\n",
       "      (resnets): ModuleList(\n",
       "        (0): ResnetBlock2D(\n",
+      "          (norm1): GroupNorm(32, 1536, eps=1e-05, affine=True)\n",
+      "          (conv1): Conv2d(1536, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "          (time_emb_proj): Linear(in_features=1024, out_features=512, bias=True)\n",
+      "          (norm2): GroupNorm(32, 512, eps=1e-05, affine=True)\n",
       "          (dropout): Dropout(p=0.0, inplace=False)\n",
+      "          (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "          (nonlinearity): SiLU()\n",
+      "          (conv_shortcut): Conv2d(1536, 512, kernel_size=(1, 1), stride=(1, 1))\n",
       "        )\n",
       "        (1): ResnetBlock2D(\n",
+      "          (norm1): GroupNorm(32, 1024, eps=1e-05, affine=True)\n",
+      "          (conv1): Conv2d(1024, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "          (time_emb_proj): Linear(in_features=1024, out_features=512, bias=True)\n",
+      "          (norm2): GroupNorm(32, 512, eps=1e-05, affine=True)\n",
       "          (dropout): Dropout(p=0.0, inplace=False)\n",
+      "          (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "          (nonlinearity): SiLU()\n",
+      "          (conv_shortcut): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1))\n",
       "        )\n",
       "        (2): ResnetBlock2D(\n",
+      "          (norm1): GroupNorm(32, 768, eps=1e-05, affine=True)\n",
+      "          (conv1): Conv2d(768, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "          (time_emb_proj): Linear(in_features=1024, out_features=512, bias=True)\n",
+      "          (norm2): GroupNorm(32, 512, eps=1e-05, affine=True)\n",
       "          (dropout): Dropout(p=0.0, inplace=False)\n",
+      "          (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "          (nonlinearity): SiLU()\n",
+      "          (conv_shortcut): Conv2d(768, 512, kernel_size=(1, 1), stride=(1, 1))\n",
       "        )\n",
       "      )\n",
       "      (upsamplers): ModuleList(\n",
       "        (0): Upsample2D(\n",
+      "          (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "        )\n",
       "      )\n",
       "    )\n",
+      "    (3): UpBlock2D(\n",
       "      (resnets): ModuleList(\n",
       "        (0): ResnetBlock2D(\n",
+      "          (norm1): GroupNorm(32, 768, eps=1e-05, affine=True)\n",
+      "          (conv1): Conv2d(768, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "          (time_emb_proj): Linear(in_features=1024, out_features=256, bias=True)\n",
+      "          (norm2): GroupNorm(32, 256, eps=1e-05, affine=True)\n",
       "          (dropout): Dropout(p=0.0, inplace=False)\n",
+      "          (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "          (nonlinearity): SiLU()\n",
+      "          (conv_shortcut): Conv2d(768, 256, kernel_size=(1, 1), stride=(1, 1))\n",
       "        )\n",
       "        (1-2): 2 x ResnetBlock2D(\n",
+      "          (norm1): GroupNorm(32, 512, eps=1e-05, affine=True)\n",
+      "          (conv1): Conv2d(512, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "          (time_emb_proj): Linear(in_features=1024, out_features=256, bias=True)\n",
+      "          (norm2): GroupNorm(32, 256, eps=1e-05, affine=True)\n",
       "          (dropout): Dropout(p=0.0, inplace=False)\n",
+      "          (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "          (nonlinearity): SiLU()\n",
+      "          (conv_shortcut): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "  (mid_block): UNetMidBlock2DCrossAttn(\n",
       "    (attentions): ModuleList(\n",
       "      (0): Transformer2DModel(\n",
+      "        (norm): GroupNorm(32, 1024, eps=1e-06, affine=True)\n",
+      "        (proj_in): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "        (transformer_blocks): ModuleList(\n",
       "          (0-7): 8 x BasicTransformerBlock(\n",
+      "            (norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
       "            (attn1): Attention(\n",
+      "              (to_q): Linear(in_features=1024, out_features=1024, bias=False)\n",
+      "              (to_k): Linear(in_features=1024, out_features=1024, bias=False)\n",
+      "              (to_v): Linear(in_features=1024, out_features=1024, bias=False)\n",
       "              (to_out): ModuleList(\n",
+      "                (0): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                (1): Dropout(p=0.0, inplace=False)\n",
       "              )\n",
       "            )\n",
+      "            (norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
       "            (attn2): Attention(\n",
+      "              (to_q): Linear(in_features=1024, out_features=1024, bias=False)\n",
+      "              (to_k): Linear(in_features=1024, out_features=1024, bias=False)\n",
+      "              (to_v): Linear(in_features=1024, out_features=1024, bias=False)\n",
       "              (to_out): ModuleList(\n",
+      "                (0): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                (1): Dropout(p=0.0, inplace=False)\n",
       "              )\n",
       "            )\n",
+      "            (norm3): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
       "            (ff): FeedForward(\n",
       "              (net): ModuleList(\n",
       "                (0): GEGLU(\n",
+      "                  (proj): Linear(in_features=1024, out_features=8192, bias=True)\n",
       "                )\n",
       "                (1): Dropout(p=0.0, inplace=False)\n",
+      "                (2): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "              )\n",
       "            )\n",
       "          )\n",
       "        )\n",
+      "        (proj_out): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "      )\n",
       "    )\n",
       "    (resnets): ModuleList(\n",
       "      (0-1): 2 x ResnetBlock2D(\n",
+      "        (norm1): GroupNorm(32, 1024, eps=1e-05, affine=True)\n",
+      "        (conv1): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "        (time_emb_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "        (norm2): GroupNorm(32, 1024, eps=1e-05, affine=True)\n",
       "        (dropout): Dropout(p=0.0, inplace=False)\n",
+      "        (conv2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "        (nonlinearity): SiLU()\n",
       "      )\n",
       "    )\n",
       "  )\n",
+      "  (conv_norm_out): GroupNorm(32, 256, eps=1e-05, affine=True)\n",
       "  (conv_act): SiLU()\n",
+      "  (conv_out): Conv2d(256, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       ")\n"
      ]
     }
    "source": [
     "config_sdxs = {\n",
     "    # === Основные размеры и каналы ===\n",
+    "    \"in_channels\": 16,               # Количество входных каналов (совместимость с VAE)\n",
+    "    \"out_channels\": 16,              # Количество выходных каналов (симметрично in_channels)          \n",
     "\n",
     "    # === Cross-Attention ===\n",
+    "    \"cross_attention_dim\": 1024,      # Размерность текстовых эмбеддингов\n",
     "    \"use_linear_projection\": True,\n",
     "    \"norm_num_groups\": 32,\n",
     "    \n",
     "        \"DownBlock2D\",\n",
     "        \"CrossAttnDownBlock2D\",\n",
     "        \"CrossAttnDownBlock2D\",\n",
+    "        \"CrossAttnDownBlock2D\",\n",
     "    ],\n",
     "    \"up_block_types\": [   # декодер\n",
+    "        \"CrossAttnUpBlock2D\",\n",
     "        \"CrossAttnUpBlock2D\",\n",
     "        \"CrossAttnUpBlock2D\",\n",
     "        \"UpBlock2D\",\n",
     "    ],\n",
     "\n",
     "    # === Конфигурация каналов ===\n",
+    "    \"block_out_channels\": [256, 512, 1024, 1024],\n",
     "\n",
+    "    \"transformer_layers_per_block\": [1, 1, 1, 8],\n",
+    "    \"attention_head_dim\": [4, 8, 16, 16],\n",
     "}\n",
     "\n",
     "def check_initialization(model):\n",
     "    print(f\"Количество параметров: {num_params}\")\n",
     "\n",
     "    # Генерация тестового латента (60x48 в latent space)\n",
+    "    test_latent = torch.randn(1, 16, 60, 48).to(\"cuda\", dtype=torch.float16)  # 60x48 ≈ 512px\n",
     "    timesteps = torch.tensor([1]).to(\"cuda\", dtype=torch.float16)\n",
+    "    encoder_hidden_states = torch.randn(1, 77, 1024).to(\"cuda\", dtype=torch.float16)\n",
     "    \n",
     "    with torch.no_grad():\n",
     "        output = new_unet(\n",
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
+   "version": "3.11.11"
   }
  },
  "nbformat": 4,

train.py CHANGED Viewed

@@ -31,10 +31,10 @@ project = "unet"
 batch_size = 16
 base_learning_rate = 9e-5
 min_learning_rate = 1e-5
-num_epochs = 30
 # samples/save per epoch
 sample_interval_share = 1
-use_wandb = False
 save_model = True
 use_decay = True
 fbp = False # fused backward pass
@@ -89,10 +89,10 @@ if fixed_seed:
 # --- Пропорции лоссов и окно медианного нормирования (КОЭФ., не значения) ---
 # CHANGED: добавлен huber и dispersive в пропорции, суммы = 1.0
 loss_ratios = {
-    "mse":   0.60,
-    "mae":   0.35,
     "huber": 0.0,
-    "dispersive": 0.05,
 }
 median_coeff_steps = 128  # за сколько шагов считать медианные коэффициенты
@@ -110,7 +110,7 @@ def sample_timesteps_bias(
     num_train_timesteps: int,  # обычно 1000
     steps_offset: int = 0,
     device=None,
-    mode: str = "beta",        # "beta", "uniform"
 ) -> torch.Tensor:
     """
     Возвращает timesteps с разным bias:
@@ -241,7 +241,7 @@ gen.manual_seed(seed)
 #        "AiArtLab/simplevae", subfolder="wan16x_vae_nightly",
 #        torch_dtype=dtype
 #    ).to(device="cpu").eval()
-vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", subfolder=None,torch_dtype=dtype).to(device).eval()
 shift_factor = getattr(vae.config, "shift_factor", 0.0)
 if shift_factor is None:

 batch_size = 16
 base_learning_rate = 9e-5
 min_learning_rate = 1e-5
+num_epochs = 300
 # samples/save per epoch
 sample_interval_share = 1
+use_wandb = True
 save_model = True
 use_decay = True
 fbp = False # fused backward pass
 # --- Пропорции лоссов и окно медианного нормирования (КОЭФ., не значения) ---
 # CHANGED: добавлен huber и dispersive в пропорции, суммы = 1.0
 loss_ratios = {
+    "mse":   1.0,
+    "mae":   0.0,
     "huber": 0.0,
+    "dispersive": 0.0,
 }
 median_coeff_steps = 128  # за сколько шагов считать медианные коэффициенты
     num_train_timesteps: int,  # обычно 1000
     steps_offset: int = 0,
     device=None,
+    mode: str = "uniform",        # "beta", "uniform"
 ) -> torch.Tensor:
     """
     Возвращает timesteps с разным bias:
 #        "AiArtLab/simplevae", subfolder="wan16x_vae_nightly",
 #        torch_dtype=dtype
 #    ).to(device="cpu").eval()
+vae = AutoencoderKL.from_pretrained("AiArtLab/simplevae",subfolder="simple_vae_nightly",torch_dtype=dtype).to(device).eval()
 shift_factor = getattr(vae.config, "shift_factor", 0.0)
 if shift_factor is None:

unet/config.json ADDED Viewed

	@@ -0,0 +1,78 @@

+{
+  "_class_name": "UNet2DConditionModel",
+  "_diffusers_version": "0.35.1",
+  "_name_or_path": "unet",
+  "act_fn": "silu",
+  "addition_embed_type": null,
+  "addition_embed_type_num_heads": 64,
+  "addition_time_embed_dim": null,
+  "attention_head_dim": [
+    4,
+    8,
+    16,
+    16
+  ],
+  "attention_type": "default",
+  "block_out_channels": [
+    256,
+    512,
+    1024,
+    1024
+  ],
+  "center_input_sample": false,
+  "class_embed_type": null,
+  "class_embeddings_concat": false,
+  "conv_in_kernel": 3,
+  "conv_out_kernel": 3,
+  "cross_attention_dim": 1024,
+  "cross_attention_norm": null,
+  "down_block_types": [
+    "DownBlock2D",
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D"
+  ],
+  "downsample_padding": 1,
+  "dropout": 0.0,
+  "dual_cross_attention": false,
+  "encoder_hid_dim": null,
+  "encoder_hid_dim_type": null,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 16,
+  "layers_per_block": 2,
+  "mid_block_only_cross_attention": null,
+  "mid_block_scale_factor": 1,
+  "mid_block_type": "UNetMidBlock2DCrossAttn",
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_attention_heads": null,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "out_channels": 16,
+  "projection_class_embeddings_input_dim": null,
+  "resnet_out_scale_factor": 1.0,
+  "resnet_skip_time_act": false,
+  "resnet_time_scale_shift": "default",
+  "reverse_transformer_layers_per_block": null,
+  "sample_size": null,
+  "time_cond_proj_dim": null,
+  "time_embedding_act_fn": null,
+  "time_embedding_dim": null,
+  "time_embedding_type": "positional",
+  "timestep_post_act": null,
+  "transformer_layers_per_block": [
+    1,
+    1,
+    1,
+    8
+  ],
+  "up_block_types": [
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D",
+    "UpBlock2D"
+  ],
+  "upcast_attention": false,
+  "use_linear_projection": true
+}

unet/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:74a909271318a4d576b1519be1697f2d7989534c89fa4b5ae0f7a7fdd04a9245
+size 6184944280