babkasotona committed on Apr 29

Commit

58bb2b7

verified ·

1 Parent(s): 9074517

Upload folder using huggingface_hub

Browse files

Files changed (39) hide show

.gitattributes +4 -0
.gitignore +21 -0
Untitled.ipynb +139 -0
dataset.py +300 -0
dataset_sample.ipynb +170 -0
model_index.json +24 -0
pipeline_sdxs.py +348 -0
pipeline_sdxs_t5.py +291 -0
scheduler/.ipynb_checkpoints/scheduler_config-checkpoint.json +22 -0
scheduler/scheduler_config.json +22 -0
t.py +116 -0
test.ipynb +3 -0
text_encoder/.ipynb_checkpoints/config-checkpoint.json +101 -0
text_encoder/config.json +101 -0
text_encoder/model.safetensors +3 -0
tokenizer/chat_template.jinja +154 -0
tokenizer/tokenizer.json +3 -0
tokenizer/tokenizer_config.json +32 -0
train-Copy1.py +924 -0
transformer/config.json +37 -0
transformer/diffusion_pytorch_model.safetensors +3 -0
vae/.ipynb_checkpoints/config-checkpoint.json +56 -0
vae/config.json +56 -0
vae/diffusion_pytorch_model.safetensors +3 -0
wandb/debug-cli.root.log +0 -0
wandb/debug-internal.log +0 -0
wandb/debug.log +19 -0
wandb/offline-run-20260428_132658-o9052r27/files/requirements.txt +117 -0
wandb/offline-run-20260428_132658-o9052r27/logs/debug-core.log +14 -0
wandb/offline-run-20260428_132658-o9052r27/logs/debug-internal.log +15 -0
wandb/offline-run-20260428_132658-o9052r27/logs/debug.log +21 -0
wandb/offline-run-20260428_132658-o9052r27/run-o9052r27.wandb +0 -0
wandb/run-20260428_171645-wt40fdyx/files/output.log +385 -0
wandb/run-20260428_171645-wt40fdyx/files/requirements.txt +117 -0
wandb/run-20260428_171645-wt40fdyx/files/wandb-metadata.json +46 -0
wandb/run-20260428_171645-wt40fdyx/logs/debug-core.log +7 -0
wandb/run-20260428_171645-wt40fdyx/logs/debug-internal.log +0 -0
wandb/run-20260428_171645-wt40fdyx/logs/debug.log +19 -0
wandb/run-20260428_171645-wt40fdyx/run-wt40fdyx.wandb +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+media/refined.jpg filter=lfs diff=lfs merge=lfs -text
+test.ipynb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20260428_171645-wt40fdyx/run-wt40fdyx.wandb filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,21 @@

+# Jupyter Notebook
+__pycache__/
+*.pyc
+.ipynb_checkpoints/
+*.ipynb_checkpoints/*
+.ipynb_checkpoints/*
+src/samples
+# cache
+cache
+datasets
+test
+wandb
+nohup.out
+samples/
+transformer/
+*.jpg
+*.png
+datasets/
+samples/
+*.jpg
+train.py

Untitled.ipynb ADDED Viewed

	@@ -0,0 +1,139 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "7e8f9dc5-d07a-4538-bc03-8953412a72fa",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Keyword arguments {'safety_checker': <__main__.DummyCosmosSafetyChecker object at 0x7f7e8c3fb620>} are not expected by SdxsPipeline and will be ignored.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "99e2522e93064308b5dd34923a133c39",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "def23cc245b2470a9012f70d5e4c78ed",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading weights:   0%|          | 0/195 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "The config attributes {'final_sigmas_type': 'sigma_min', 'sigma_data': 1.0, 'sigma_max': 80.0, 'sigma_min': 0.002} were passed to FlowMatchEulerDiscreteScheduler, but are not expected and will be ignored. Please verify your scheduler_config.json configuration file.\n",
+      "Sampling: 100%|██████████| 40/40 [00:10<00:00,  3.65it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Готово! Изображение сохранено как output.png\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "from diffusers import Cosmos2TextToImagePipeline\n",
+    "\n",
+    "class DummyCosmosSafetyChecker:\n",
+    "    def to(self, *args, **kwargs):\n",
+    "        return self\n",
+    "        \n",
+    "    def eval(self):\n",
+    "        return self\n",
+    "\n",
+    "    # Обход проверки текста\n",
+    "    def check_text_safety(self, prompt, *args, **kwargs):\n",
+    "        return True\n",
+    "\n",
+    "    # Обход проверки \"видео\" (картинки из 1 кадра)\n",
+    "    def check_video_safety(self, vid, *args, **kwargs):\n",
+    "        # Просто возвращаем тензор обратно без изменений\n",
+    "        return vid\n",
+    "\n",
+    "    # На всякий случай оставляем оригинальный __call__\n",
+    "    def __call__(self, images, **kwargs):\n",
+    "        return images, [False] * len(images)\n",
+    "\n",
+    "model_id = \"/workspace/sdxs-2b\"\n",
+    "\n",
+    "pipe = Cosmos2TextToImagePipeline.from_pretrained(\n",
+    "    model_id,\n",
+    "    safety_checker=DummyCosmosSafetyChecker(), \n",
+    "    torch_dtype=torch.bfloat16                 \n",
+    ")\n",
+    "pipe.to(\"cuda\")\n",
+    "\n",
+    "prompt = \"In a serene garden, two young girls stand side by side, their youthful energy palpable. The girl on the left, adorned with a blue dress and a matching blue flower in her hair, gazes directly at the viewer, her eyes sparkling with curiosity.\"#\"There is a young male character standing against a vibrant, colorful graffiti wall. he is wearing a  hat, a  jacket adorned with gold accents, and black shorts.\"\n",
+    "negative_prompt = \"The video captures a series of frames showing ugly scenes, static with no motion, motion blur, over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. Overall, the video is of poor quality.\"\n",
+    "\n",
+    "# 3. Генерируем изображение\n",
+    "output = pipe(\n",
+    "    height = 1024,\n",
+    "    width=1024,\n",
+    "    prompt=prompt, \n",
+    "    negative_prompt=negative_prompt, \n",
+    "    generator=torch.Generator(device=\"cuda\").manual_seed(1)\n",
+    ").images[0]\n",
+    "\n",
+    "output.save(\"output.png\")\n",
+    "print(\"Готово! Изображение сохранено как output.png\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8a173167-6c28-4bbd-8879-1375e0fd37f0",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

dataset.py ADDED Viewed

	@@ -0,0 +1,300 @@

+# pip install flash-attn --no-build-isolation
+import torch
+import os
+import gc
+import numpy as np
+import random
+import json
+import shutil
+import time
+from datasets import Dataset, load_from_disk, concatenate_datasets
+from diffusers import AutoencoderKLQwenImage
+from torchvision.transforms import Resize, ToTensor, Normalize, Compose, InterpolationMode, Lambda
+from transformers import AutoModel, AutoImageProcessor, AutoTokenizer, AutoModelForCausalLM
+from typing import Dict, List, Tuple, Optional, Any
+from PIL import Image
+from tqdm import tqdm
+from datetime import timedelta
+from accelerate import Accelerator
+# One Accelerator instance supplies the device and the rank information
+# that the script later uses to shard the file list across processes.
+accelerator = Accelerator()
+device = accelerator.device
+is_main_process = accelerator.is_main_process
+process_index = accelerator.process_index
+num_processes = accelerator.num_processes
+# ---------------- 1️⃣ Settings ----------------
+dtype = torch.float16  # dtype used for the VAE weights and the image batches
+batch_size = 5  # batch size handed to process_in_chunks
+min_size = 320  # minimum side length handed to get_image_transform
+max_size = 640  # maximum side length handed to get_image_transform
+step = 32  # crop sizes are floored to a multiple of this value
+empty_share = 0.0  # probability handed to process_labels_for_guidance
+limit = 0  # handed to process_folder
+folder_path = "/workspace/dataset/d23"  # folder scanned for images and .txt captions
+save_path = "/workspace/ds234_640_vae_qwen"  # where the combined dataset is saved
+os.makedirs(save_path, exist_ok=True)
+def clear_cuda_memory():
+    if torch.cuda.is_available():
+        used_gb = torch.cuda.max_memory_allocated() / 1024**3
+        print(f"[GPU {process_index}] used_gb: {used_gb:.2f} GB")
+        torch.cuda.empty_cache()
+        gc.collect()
+# ---------------- 2️⃣ Загрузка моделей ----------------
+def load_models():
+    print(f"[GPU {process_index}] Загрузка моделей...")
+    vae = AutoencoderKLQwenImage.from_pretrained("vae", torch_dtype=dtype).to(device).eval()
+    return vae
+vae = load_models()
+# `or` also maps an explicit None (or 0) in the config to the neutral value.
+shift_factor = getattr(vae.config, "shift_factor", 0.0) or 0.0
+scaling_factor = getattr(vae.config, "scaling_factor", 1.0) or 1.0
+mean = getattr(vae.config, "latents_mean", None)
+std = getattr(vae.config, "latents_std", None)
+# latents_std / latents_mean exist only when the config provides both lists;
+# encode_to_latents guards its use of them with the same condition.
+if mean is not None and std is not None:
+    # Shaped (1, C, 1, 1, 1) so they broadcast against 5-D tensors.
+    latents_std = torch.tensor(std, device=device, dtype=dtype).view(1, len(std), 1, 1, 1)
+    latents_mean = torch.tensor(mean, device=device, dtype=dtype).view(1, len(mean), 1, 1, 1)
+# ---------------- 3️⃣ Трансформации ----------------
+def get_image_transform(min_size=256, max_size=512, step=64):
+    def transform(img, dry_run=False):
+        original_width, original_height = img.size
+        if original_width >= original_height:
+            new_width = max_size
+            new_height = int(max_size * original_height / original_width)
+        else:
+            new_height = max_size
+            new_width = int(max_size * original_width / original_height)
+        if new_height < min_size or new_width < min_size:
+            if original_width <= original_height:
+                new_width = min_size
+                new_height = int(min_size * original_height / original_width)
+            else:
+                new_height = min_size
+                new_width = int(min_size * original_width / original_height)
+        crop_width = min(max_size, (new_width // step) * step)
+        crop_height = min(max_size, (new_height // step) * step)
+        crop_width = max(min_size, crop_width)
+        crop_height = max(min_size, crop_height)
+        if dry_run:
+            return crop_width, crop_height
+        img_resized = img.convert("RGB").resize((new_width, new_height), Image.LANCZOS)
+        top = (new_height - crop_height) // 3
+        left = 0
+        img_cropped = img_resized.crop((left, top, left + crop_width, top + crop_height))
+        final_width, final_height = img_cropped.size
+        img_tensor = ToTensor()(img_cropped)
+        img_tensor = Normalize(mean=[0.5]*3, std=[0.5]*3)(img_tensor)
+        return img_tensor, img_cropped, final_width, final_height
+    return transform
+# ---------------- 4️⃣ Функции обработки ----------------
+def clean_label(label):
+    label = label.replace("Image 1","").replace("Image 2","").replace("Image 3","").replace("Image 4","")
+    label = label.replace("The image depicts ","").replace("The image presents ","")
+    label = label.replace("The image features ","").replace("The image portrays ","").replace("The image is ","").strip()
+    if label.startswith("."):
+        label = label[1:].lstrip()
+    return label
+def process_labels_for_guidance(original_labels, prob_to_make_empty=0.01):
+    labels_for_model = []
+    labels_for_logging = []
+    for label in original_labels:
+        if random.random() < prob_to_make_empty:
+            labels_for_model.append("")
+            labels_for_logging.append(f"zero: {label}")
+        else:
+            labels_for_model.append(label)
+            labels_for_logging.append(label)
+    return labels_for_model, labels_for_logging
+def encode_to_latents(images, texts):
+    transform = get_image_transform(min_size, max_size, step)
+    transformed_tensors = []
+    widths, heights = [], []
+    for img in images:
+        try:
+            t_img, _, w, h = transform(img)
+            transformed_tensors.append(t_img)
+            widths.append(w)
+            heights.append(h)
+        except Exception as e:
+            print(f"Ошибка трансформации: {e}")
+    if not transformed_tensors:
+        return None
+    batch_tensor = torch.stack(transformed_tensors).to(device, dtype)
+    if batch_tensor.ndim==4:
+        batch_tensor = batch_tensor.unsqueeze(2)
+    with torch.no_grad():
+        posteriors = vae.encode(batch_tensor).latent_dist.mode()
+        if mean is not None and std is not None:
+            posteriors = (posteriors - latents_mean) / latents_std
+        posteriors = (posteriors - shift_factor) / scaling_factor
+    #latents_np = posteriors.cpu().numpy()
+    latents_np = posteriors.squeeze(2).cpu().numpy()
+    text_labels = [clean_label(text) for text in texts]
+    _, text_labels = process_labels_for_guidance(text_labels, empty_share)
+    return {
+        "vae": latents_np,
+        "text": text_labels,
+        "width": widths,
+        "height": heights
+    }
+# ---------------- 5️⃣ Обработка папки ----------------
+def process_folder(folder_path, limit=None):
+    image_paths, text_paths, width, height = [], [], [], []
+    transform = get_image_transform(min_size, max_size, step)
+    for root, _, files in os.walk(folder_path):
+        for filename in files:
+            if filename.lower().endswith((".jpg",".jpeg",".png")):
+                image_path = os.path.join(root, filename)
+                try:
+                    img = Image.open(image_path)
+                except:
+                    continue
+                w,h = transform(img, dry_run=True)
+                text_path = os.path.splitext(image_path)[0]+".txt"
+                if os.path.exists(text_path):
+                    image_paths.append(image_path)
+                    text_paths.append(text_path)
+                    width.append(w)
+                    height.append(h)
+    print(f"Найдено {len(image_paths)} изображений")
+    return image_paths, text_paths, width, height
+def process_in_chunks(image_paths, text_paths, width, height, chunk_size=10000, batch_size=1):
+    total_files = len(image_paths)
+    start_time = time.time()
+    for chunk_idx, start in enumerate(range(0,total_files,chunk_size),1):
+        end = min(start+chunk_size,total_files)
+        chunk_image_paths = image_paths[start:end]
+        chunk_text_paths = text_paths[start:end]
+        chunk_widths = width[start:end]
+        chunk_heights = height[start:end]
+        chunk_texts = []
+        for text_path in chunk_text_paths:
+            try:
+                with open(text_path,'r',encoding='utf-8') as f:
+                    chunk_texts.append(f.read().strip())
+            except:
+                chunk_texts.append("")
+        size_groups = {}
+        for i in range(len(chunk_image_paths)):
+            key=(chunk_widths[i],chunk_heights[i])
+            size_groups.setdefault(key,{"image_paths":[],"texts":[]})
+            size_groups[key]["image_paths"].append(chunk_image_paths[i])
+            size_groups[key]["texts"].append(chunk_texts[i])
+        for size_key,group_data in size_groups.items():
+            group_dataset = Dataset.from_dict(group_data)
+            processed_group = group_dataset.map(
+                lambda ex: encode_to_latents(
+                    [Image.open(p) for p in ex["image_paths"]],
+                    #[Image.open(p).convert("RGB") for p in ex["image_paths"]], # <--- Добавил .convert("RGB"), чтобы картинка загрузилась в память
+                    ex["texts"]
+                ),
+                batched=True,
+                batch_size=batch_size,
+            )
+            # --- NEW: уникальный путь ---
+            group_save_path = f"{save_path}_temp/chunk_{chunk_idx}_{size_key[0]}x{size_key[1]}_proc_{process_index}_"
+            # --- END NEW ---
+            processed_group.save_to_disk(group_save_path)
+            clear_cuda_memory()
+# ---------------- 7️⃣ Объединение ----------------
+def combine_chunks(temp_path, final_path):
+    chunks = sorted([
+        os.path.join(temp_path,d)
+        for d in os.listdir(temp_path)
+        if "chunk_" in d
+    ])
+    datasets = [load_from_disk(c) for c in chunks]
+    combined = concatenate_datasets(datasets)
+    combined.save_to_disk(final_path)
+    print("✅ Сохранено")
+# ---------------- MAIN ----------------
+temp_path = f"{save_path}_temp"
+os.makedirs(temp_path, exist_ok=True)
+image_paths, text_paths, width, height = process_folder(folder_path,limit)
+# сортировка
+sorted_indices = sorted(range(len(width)), key=lambda i:(width[i],height[i]))
+image_paths = [image_paths[i] for i in sorted_indices]
+text_paths = [text_paths[i] for i in sorted_indices]
+width = [width[i] for i in sorted_indices]
+height = [height[i] for i in sorted_indices]
+# --- shard по GPU ---
+indices = list(range(len(image_paths)))
+indices = indices[process_index::num_processes]
+image_paths = [image_paths[i] for i in indices]
+text_paths = [text_paths[i] for i in indices]
+width = [width[i] for i in indices]
+height = [height[i] for i in indices]
+print(f"[GPU {process_index}] обрабатывает {len(image_paths)} файлов")
+process_in_chunks(image_paths, text_paths, width, height, chunk_size=5000, batch_size=batch_size)
+accelerator.wait_for_everyone()
+# --- NEW: только главный процесс ---
+if is_main_process:
+    #try:
+        #shutil.rmtree(folder_path)
+    #except:
+    #    pass
+    combine_chunks(temp_path, save_path)
+    try:
+        shutil.rmtree(temp_path)
+    except:
+        pass

dataset_sample.ipynb ADDED Viewed

	@@ -0,0 +1,170 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "9c312df2-cb57-44f6-af54-3af6ab8f962f",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'numpy'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[3], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m#from datasets import load_from_disk\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[1;32m      3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\n\u001b[1;32m      4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mPIL\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Image\n",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'numpy'"
+     ]
+    }
+   ],
+   "source": [
+    "from datasets import load_from_disk\n",
+    "import numpy as np\n",
+    "import torch\n",
+    "from PIL import Image\n",
+    "from collections import defaultdict\n",
+    "from diffusers import AutoencoderKLQwenImage\n",
+    "import gc\n",
+    "\n",
+    "def analyze_dataset_by_size(dataset_path):\n",
+    "    \"\"\"\n",
+    "    Группирует датасет по размерам изображений и выводит базовую информацию.\n",
+    "    \"\"\"\n",
+    "    # Настройка устройства и типа данных\n",
+    "    dtype = torch.float16\n",
+    "    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+    "    \n",
+    "    # Загрузка VAE модели\n",
+    "    print(\"Загрузка VAE модели...\")\n",
+    "    vae = AutoencoderKLQwenImage.from_pretrained(\"vae\",torch_dtype=dtype).to(device).eval()\n",
+    "    shift_factor = getattr(vae.config, \"shift_factor\", 0.0)\n",
+    "    if shift_factor is None:\n",
+    "        shift_factor = 0.0\n",
+    "    \n",
+    "    scaling_factor = getattr(vae.config, \"scaling_factor\", 1.0)\n",
+    "    if scaling_factor is None:\n",
+    "        scaling_factor = 1.0\n",
+    "        \n",
+    "    mean = getattr(vae.config, \"latents_mean\", None)\n",
+    "    std = getattr(vae.config, \"latents_std\", None)\n",
+    "    if mean is not None and std is not None:\n",
+    "        latents_std = torch.tensor(std, device=device, dtype=dtype).view(1, len(std), 1, 1)\n",
+    "        latents_mean = torch.tensor(mean, device=device, dtype=dtype).view(1, len(mean), 1, 1)\n",
+    "    \n",
+    "    # Загружаем датасет\n",
+    "    print(f\"Загрузка датасета из {dataset_path}...\")\n",
+    "    dataset = load_from_disk(dataset_path)\n",
+    "\n",
+    "    print(f\"Осталось примеров после фильтрации: {len(dataset)}\")\n",
+    "    \n",
+    "    # Группируем примеры по размерам\n",
+    "    print(\"\\nГруппировка примеров по размерам...\")\n",
+    "    size_to_indices = defaultdict(list)\n",
+    "    \n",
+    "    # Собираем примеры с одинаковыми размерами\n",
+    "    # Собираем примеры с одинаковыми размерами (оптимизированная версия)\n",
+    "    widths = dataset[\"width\"]\n",
+    "    heights = dataset[\"height\"]\n",
+    "    for i, (w, h) in enumerate(zip(widths, heights)):\n",
+    "        size_to_indices[(w, h)].append(i)\n",
+    "    \n",
+    "    # Сортируем размеры по количеству примеров\n",
+    "    print(\"\\nСортируем...\")\n",
+    "    size_stats = [(size, len(indices)) for size, indices in size_to_indices.items()]\n",
+    "    size_stats.sort(key=lambda x: x[1], reverse=True)\n",
+    "    \n",
+    "    # Выводим информацию о каждой группе и показываем первый пример\n",
+    "    for size, count in size_stats:\n",
+    "        width, height = size\n",
+    "        first_idx = size_to_indices[size][1]\n",
+    "        example = dataset[first_idx]\n",
+    "        \n",
+    "        print(f\"\\n--- Батч {width}x{height}: {count} примеров ---\")\n",
+    "        \n",
+    "        # Декодируем латентное представление для первого примера\n",
+    "        latent = torch.tensor(example[\"vae\"], dtype=dtype).unsqueeze(0).to(device)\n",
+    "        \n",
+    "        # 1. Снова обманываем VAE, превращая картинку в \"видео из 1 кадра\" [B, C, 1, H, W]\n",
+    "        if latent.ndim == 4:\n",
+    "            latent = latent.unsqueeze(2)\n",
+    "        \n",
+    "        with torch.no_grad():\n",
+    "            if latents_mean is not None and latents_std is not None:\n",
+    "                latent = latent * latents_std + latents_mean\n",
+    "            \n",
+    "            print(f\"Min of latent_for_vae: {latent.min()}\")\n",
+    "            print(f\"Max of latent_for_vae: {latent.max()}\")\n",
+    "            print(f\"Mean of latent_for_vae: {latent.mean()}\")\n",
+    "            print(f\"Std: {latent.std().item():.4f}\")\n",
+    "            if torch.isnan(latent).any() or torch.isinf(latent).any():\n",
+    "                print(\"WARNING: Raw latents contain NaN or Inf values!\")\n",
+    "            \n",
+    "            reconstructed_image = vae.decode(latent).sample\n",
+    "        \n",
+    "        # 2. Вытаскиваем обычную 3D-картинку [C, H, W] из 5D-видеотензора\n",
+    "        if reconstructed_image.ndim == 5:\n",
+    "            # Берем нулевой батч, все каналы, нулевой кадр, всю высоту и ширину\n",
+    "            img_tensor = reconstructed_image[0, :, 0, :, :] \n",
+    "        else:\n",
+    "            img_tensor = reconstructed_image.squeeze(0) # На всякий случай, если VAE вернул 4D\n",
+    "        \n",
+    "        img_array = img_tensor.cpu().numpy()\n",
+    "        img_array = np.transpose(img_array, (1, 2, 0))\n",
+    "        img_array = (img_array + 1) / 2  # Нормализация к [0, 1]\n",
+    "        img_array = np.clip(img_array * 255, 0, 255).astype(np.uint8)  # Преобразуем в uint8 для PIL\n",
+    "        \n",
+    "        # Создаем PIL изображение из массива\n",
+    "        pil_image = Image.fromarray(img_array)\n",
+    "        print(f\"Текст: {example['text']}\")\n",
+    "        print(f\"Ключи: {', '.join(example.keys())}\")\n",
+    "        print(f\"latent: {latent.shape}\")\n",
+    "        pil_image.save(\"1.jpg\")\n",
+    "    \n",
+    "    # Очистка памяти\n",
+    "    if torch.cuda.is_available():\n",
+    "        torch.cuda.empty_cache()\n",
+    "        gc.collect()\n",
+    "    \n",
+    "    return size_to_indices  # Возвращаем словарь с индексами по группам\n",
+    "\n",
+    "# Использование\n",
+    "if __name__ == \"__main__\":\n",
+    "    # Путь к датасету\n",
+    "    save_path = \"datasets/ds234_640_vae_qwen\"\n",
+    "    \n",
+    "    # Анализ датасета\n",
+    "    size_groups = analyze_dataset_by_size(save_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "74a5d11d-369f-4f25-9ee0-31d3bccd0254",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

model_index.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "_class_name": ["pipeline_sdxs", "SdxsPipeline"],
+  "_diffusers_version": "0.36.0",
+  "scheduler": [
+    "diffusers",
+    "FlowMatchEulerDiscreteScheduler"
+  ],
+  "text_encoder": [
+    "transformers",
+    "Qwen3_5ForConditionalGeneration"
+  ],
+  "tokenizer": [
+    "transformers",
+    "Qwen3_5Tokenizer"
+  ],
+  "transformer": [
+    "diffusers",
+    "CosmosTransformer3DModel"
+  ],
+  "vae": [
+    "diffusers",
+    "AutoencoderKLQwenImage"
+  ]
+}

pipeline_sdxs.py ADDED Viewed

	@@ -0,0 +1,348 @@

+import torch
+import numpy as np
+from PIL import Image
+from typing import List, Union, Optional, Tuple
+from dataclasses import dataclass
+from diffusers import DiffusionPipeline
+from diffusers.utils import BaseOutput
+from tqdm import tqdm
+@dataclass
+class SdxsPipelineOutput(BaseOutput):
+    """Output record holding images and, optionally, the prompt(s) used."""
+    # Images, either as a list of PIL images or as a NumPy array.
+    images: Union[List[Image.Image], np.ndarray]
+    # Presumably the prompt(s) the images were produced from -- confirm against the caller.
+    prompt: Optional[Union[str, List[str]]] = None
+class SdxsPipeline(DiffusionPipeline):
+    # Cosmos requires 512 tokens
+    MAX_TEXT_TOKENS = 512
+    def __init__(self, vae, text_encoder, tokenizer, transformer, scheduler):
+        """Register the components and cache VAE and scheduler settings.
+
+        Sets `vae_scale_factor`, `vae_latents_mean` and `vae_latents_std`
+        from the VAE config, and writes sigma-related defaults into the
+        scheduler config when a scheduler is given.
+        """
+        super().__init__()
+        # Register the modules (with Qwen)
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            transformer=transformer,
+            scheduler=scheduler
+        )
+        # Default to the config's spatial_compression_ratio (8 if absent);
+        # a config with block_out_channels overrides it with 2 ** (n_blocks - 1).
+        self.vae_scale_factor = getattr(self.vae.config, "spatial_compression_ratio", 8)
+        if hasattr(self.vae.config, "block_out_channels"):
+            self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        # Load the VAE latent mean and std (Cosmos-style)
+        mean = getattr(self.vae.config, "latents_mean", None)
+        std = getattr(self.vae.config, "latents_std", None)
+        if mean is not None and std is not None:
+            # Both are shaped (1, C, 1, 1, 1) so they broadcast against 5-D tensors.
+            self.vae_latents_mean = torch.tensor(mean).view(1, len(mean), 1, 1, 1)
+            # NOTE(review): the earlier comment said Cosmos uses an inverted std
+            # (1.0 / std) for decoding, but std is stored as-is here; any
+            # inversion has to happen where this tensor is used -- confirm.
+            self.vae_latents_std =  torch.tensor(std).view(1, len(std), 1, 1, 1)
+        else:
+            self.vae_latents_mean = None
+            self.vae_latents_std = None
+        # Register the Cosmos parameters in the scheduler config (values already there are kept)
+        if self.scheduler is not None:
+            self.scheduler.register_to_config(
+                sigma_max=getattr(self.scheduler.config, "sigma_max", 80.0),
+                sigma_min=getattr(self.scheduler.config, "sigma_min", 0.002),
+                sigma_data=getattr(self.scheduler.config, "sigma_data", 1.0),
+                final_sigmas_type=getattr(self.scheduler.config, "final_sigmas_type", "sigma_min"),
+            )
+    @staticmethod
+    def _pad_tensor_to_length(tensor: torch.Tensor, target_len: int, dim: int = 1, pad_value: float = 0) -> torch.Tensor:
+        current_len = tensor.shape[dim]
+        if current_len >= target_len:
+            return tensor
+        pad_size = target_len - current_len
+        if tensor.dim() == 3:
+            padding = (0, 0, 0, pad_size, 0, 0)
+        elif tensor.dim() == 2:
+            padding = (0, pad_size, 0, 0)
+        else:
+            raise ValueError(f"Unsupported tensor dimension: {tensor.dim()}")
+        return torch.nn.functional.pad(tensor, padding, value=pad_value)
+    @torch.no_grad()
+    def refine_prompts(
+        self,
+        prompts: Union[str, List[str]],
+        system_prompt: Optional[str] = None,
+        temperature: float = 0.7
+    ) -> List[str]:
+        """Refines a list of prompts using the Text Encoder (LLM)."""
+        device = self.device
+        if system_prompt is None:
+            system_prompt = (
+                "You are a skilled text-to-image prompt engineer whose sole function is to transform "
+                "the user's input into an aesthetically optimized, detailed, and visually descriptive two-sentence output. "
+                "**The primary subject MUST be the main focus of the revised prompt "
+                "and MUST be described in rich detail within the first sentence.** "
+                "Output **only** the final revised prompt, with absolutely no commentary. "
+                "Don't use cliches like warm, soft, vibrant, wildflowers. Be creative. User input prompt: "
+            )
+        pad_id = getattr(self.text_encoder.config, "pad_token_id", None) or \
+                 getattr(self.text_encoder.config, "eos_token_id", None)
+        prompts_list = [prompts] if isinstance(prompts, str) else prompts
+        refined_list = []
+        for p in prompts_list:
+            full_text = system_prompt + p
+            messages = [{"role": "user", "content": [{"type": "text", "text": full_text}]}]
+            inputs = self.tokenizer.apply_chat_template(
+                messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
+            ).to(device)
+            generated_ids = self.text_encoder.generate(
+                **inputs,
+                max_new_tokens=self.MAX_TEXT_TOKENS,
+                do_sample=True,
+                temperature=temperature,
+                pad_token_id=pad_id
+            )
+            generated_ids_trimmed = [
+                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+            ]
+            output_text = self.tokenizer.batch_decode(
+                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+            )
+            refined_list.append(output_text[0])
+        return refined_list
+    @torch.no_grad()
+    def encode_text(self, text: Union[str, List[str]]) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Qwen-specific text encoding (using chat_template and hidden_states[-2])"""
+        device = self.device
+        dtype = self.transformer.dtype
+        if text is None: text = ""
+        if isinstance(text, str): text = [text]
+        formatted_prompts = []
+        for t in text:
+            messages = [{"role": "user", "content": [{"type": "text", "text": t}]}]
+            formatted_prompts.append(self.tokenizer.apply_chat_template(messages, add_generation_prompt=False, tokenize=False))
+        toks = self.tokenizer(formatted_prompts, padding="max_length", max_length=self.MAX_TEXT_TOKENS, truncation=True, return_tensors="pt").to(device)
+        outputs = self.text_encoder(input_ids=toks.input_ids, attention_mask=toks.attention_mask, output_hidden_states=True)
+        # Берем предпоследний слой эмбеддингов, как того требуют современные пайплайны
+        last_hidden = outputs.hidden_states[-2]
+        return last_hidden.to(dtype=dtype), toks.attention_mask.to(dtype=torch.int64)
+    @torch.no_grad()
+    def image_upscale(self, image: Union[str, Image.Image, List[Union[str, Image.Image]]], batch_size: int = 1) -> List[Image.Image]:
+        images = [image] if isinstance(image, (str, Image.Image)) else image
+        batch_data = []
+        for img in images:
+            if isinstance(img, str): img = Image.open(img)
+            if img.mode == "RGBA":
+                img = Image.alpha_composite(Image.new("RGBA", img.size, (255, 255, 255)), img)
+            img = img.convert("RGB")
+            w, h = img.size
+            pw, ph = (8 - w % 8) % 8, (8 - h % 8) % 8
+            if pw or ph:
+                padded = Image.new("RGB", (w + pw, h + ph), (255, 255, 255))
+                padded.paste(img)
+                img = padded
+            t = torch.from_numpy(np.array(img).astype(np.float32) / 127.5 - 1.0).permute(2, 0, 1)
+            batch_data.append((t.to(self.device, torch.float16), w, h))
+        unique_shapes = {t.shape for t, _, _ in batch_data}
+        step = batch_size if len(unique_shapes) == 1 else 1
+        output_images = []
+        for i in range(0, len(batch_data), step):
+            chunk = batch_data[i : i + step]
+            tensors = torch.stack([c[0] for c in chunk]).unsqueeze(2)
+            latents = self.vae.encode(tensors).latent_dist.mean
+            decoded = self.vae.decode(latents.to(self.vae.dtype))[0]
+            if decoded.ndim == 5:
+                decoded = decoded.squeeze(2)
+            decoded = (decoded.clamp(-1, 1) + 1) / 2
+            for j, tensor in enumerate(decoded):
+                w, h = chunk[j][1], chunk[j][2]
+                arr = tensor.cpu().permute(1, 2, 0).float().numpy()
+                arr = arr[:h * 2, :w * 2]
+                output_images.append(Image.fromarray((arr * 255).astype("uint8")))
+        return output_images
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        height: int = 1024,
+        width: int = 1024,
+        num_inference_steps: int = 40,
+        guidance_scale: float = 4.0,
+        generator: Optional[torch.Generator] = None,
+        seed: Optional[int] = None,
+        output_type: str = "pil",
+        return_dict: bool = True,
+        **kwargs,
+    ):
+        device = self.device
+        dtype = self.transformer.dtype
+        if generator is None and seed is not None:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # 1. Encode Positive
+        if prompt_embeds is None:
+            if prompt is None: raise ValueError("`prompt` or `prompt_embeds` required.")
+            prompt_embeds, prompt_attention_mask = self.encode_text(prompt)
+        prompt_embeds = prompt_embeds.to(device=device, dtype=dtype)
+        prompt_attention_mask = prompt_attention_mask.to(device=device, dtype=torch.int64)
+        batch_size = prompt_embeds.shape[0]
+        # 2. Encode Negative
+        if do_classifier_free_guidance:
+            if negative_prompt_embeds is None:
+                neg_text = negative_prompt if negative_prompt is not None else ("" if isinstance(prompt, str) else [""] * len(prompt))
+                negative_prompt_embeds, negative_prompt_attention_mask = self.encode_text(neg_text)
+            negative_prompt_embeds = negative_prompt_embeds.to(device=device, dtype=dtype)
+            negative_prompt_attention_mask = negative_prompt_attention_mask.to(device=device, dtype=torch.int64)
+            if negative_prompt_embeds.shape[0] != batch_size:
+                negative_prompt_embeds = negative_prompt_embeds.repeat(batch_size, 1, 1)
+                negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(batch_size, 1)
+            max_len = max(prompt_embeds.shape[1], negative_prompt_embeds.shape[1])
+            prompt_embeds = self._pad_tensor_to_length(prompt_embeds, max_len, dim=1, pad_value=0)
+            negative_prompt_embeds = self._pad_tensor_to_length(negative_prompt_embeds, max_len, dim=1, pad_value=0)
+            prompt_attention_mask = self._pad_tensor_to_length(prompt_attention_mask, max_len, dim=1, pad_value=0)
+            negative_prompt_attention_mask = self._pad_tensor_to_length(negative_prompt_attention_mask, max_len, dim=1, pad_value=0)
+            text_embeddings = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+        else:
+            text_embeddings = prompt_embeds
+        # 3. Prepare Timesteps (Cosmos specific schedule)
+        sigmas_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
+        sigmas = torch.linspace(0, 1, num_inference_steps, dtype=sigmas_dtype)
+        self.scheduler.set_timesteps(sigmas=sigmas, device=device)
+        timesteps = self.scheduler.timesteps
+        # Защита от деления на ноль на последнем шаге
+        if self.scheduler.config.get("final_sigmas_type", "zero") == "sigma_min":
+            self.scheduler.sigmas[-1] = self.scheduler.sigmas[-2]
+        if self.scheduler.sigmas[-1] == 0.0:
+            self.scheduler.sigmas[-1] = 1e-4
+        # 4. Prepare Latents (Noise)
+        latent_h = height // self.vae_scale_factor
+        latent_w = width // self.vae_scale_factor
+        in_channels = self.transformer.config.in_channels
+        sigma_max = getattr(self.scheduler.config, "sigma_max", 80.0)
+        if latents is None:
+            # Создаем 5D тензор [Batch, Channels, Frames, Height, Width]
+            latents = torch.randn((batch_size, in_channels, 1, latent_h, latent_w), generator=generator, device=device, dtype=dtype)
+            latents = latents * sigma_max
+        else:
+            latents = latents.to(device=device, dtype=dtype) * sigma_max
+        # Cosmos Padding Mask
+        padding_mask = torch.zeros((1, 1, height, width), device=device, dtype=dtype)
+        # 5. Denoising Loop (Continuous Flow Math)
+        for i, t in enumerate(tqdm(timesteps, desc="Sampling")):
+            current_sigma = self.scheduler.sigmas[i]
+            # Защита от деления на 0 при вычислении current_t
+            if current_sigma == 0.0:
+                current_sigma = torch.tensor(1e-4, dtype=current_sigma.dtype, device=device)
+            current_t = current_sigma / (current_sigma + 1.0)
+            c_in = 1.0 - current_t
+            c_skip = 1.0 - current_t
+            c_out = -current_t
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = (latent_model_input * c_in).to(dtype)
+            # Трансформер ждет timestep в виде 1D тензора [B]
+            t_val = float(current_t.item()) if torch.is_tensor(current_t) else float(current_t)
+            timestep_tensor = torch.tensor(
+                [t_val],
+                device=device,
+                dtype=dtype
+            ).view(1, 1, 1, 1, 1).expand(latent_model_input.shape[0], 1, 1, 1, 1)
+            model_out = self.transformer(
+                hidden_states=latent_model_input,
+                timestep=timestep_tensor,
+                encoder_hidden_states=text_embeddings,
+                padding_mask=padding_mask,
+                return_dict=False,
+            )[0]
+            batched_latents = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            noise_pred = (c_skip * batched_latents + c_out * model_out.float()).to(dtype)
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
+            noise_pred = (latents - noise_pred) / current_sigma
+            latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+        # 6. Decode
+        if output_type == "latent":
+            if not return_dict: return (latents, prompt)
+            return SdxsPipelineOutput(images=latents)
+        if getattr(self.vae.config, "latents_std", None) is not None and getattr(self.vae.config, "latents_mean", None) is not None:
+            sigma_data = getattr(self.scheduler.config, "sigma_data", 1.0)
+            l_mean = torch.tensor(self.vae.config.latents_mean).view(1, -1, 1, 1, 1).to(device, dtype)
+            l_std = torch.tensor(self.vae.config.latents_std).view(1, -1, 1, 1, 1).to(device, dtype)
+            # Оригинальная формула: делим на инвертированный std (что равноценно умножению на std)
+            #latents_std_inv = 1.0 / l_std
+            latents = latents * l_std + l_mean
+        image_output = self.vae.decode(latents.to(self.vae.dtype), return_dict=False)[0]
+        if image_output.ndim == 5:
+            image_output = image_output.squeeze(2)
+        image_output = (image_output.clamp(-1, 1) + 1) / 2
+        image_np = image_output.cpu().permute(0, 2, 3, 1).float().numpy()
+        # На всякий случай вычищаем NaNs
+        image_np = np.nan_to_num(image_np, nan=0.0, posinf=1.0, neginf=0.0)
+        if output_type == "pil":
+            images = [(Image.fromarray((img * 255).round().astype("uint8"))) for img in image_np]
+        else:
+            images = image_np
+        if not return_dict:
+            return (images,)
+        return SdxsPipelineOutput(images=images)

pipeline_sdxs_t5.py ADDED Viewed

	@@ -0,0 +1,291 @@

+import torch
+import numpy as np
+from PIL import Image
+from typing import List, Union, Optional, Tuple
+from dataclasses import dataclass
+from diffusers import DiffusionPipeline
+from diffusers.utils import BaseOutput
+from tqdm import tqdm
+@dataclass
+class SdxsPipelineOutput(BaseOutput):
+    """Result object returned by ``SdxsPipeline.__call__`` when ``return_dict`` is true."""
+    # Generated images: a list of PIL images or a NumPy array. The pipeline also
+    # stores the raw latents tensor here when called with output_type="latent".
+    images: Union[List[Image.Image], np.ndarray]
+    # Optional prompt(s) associated with the images; None unless the caller sets it.
+    prompt: Optional[Union[str, List[str]]] = None
+class SdxsPipeline(DiffusionPipeline):
+    # Cosmos requires 512 tokens
+    MAX_TEXT_TOKENS = 512
+    def __init__(self, vae, text_encoder, tokenizer, transformer, scheduler):
+        super().__init__()
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            transformer=transformer,
+            scheduler=scheduler
+        )
+        self.vae_scale_factor = getattr(self.vae.config, "spatial_compression_ratio", 8)
+        if hasattr(self.vae.config, "block_out_channels"):
+            self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        # Регистрируем параметры Cosmos в шедулере
+        if self.scheduler is not None:
+            self.scheduler.register_to_config(
+                sigma_max=getattr(self.scheduler.config, "sigma_max", 80.0),
+                sigma_min=getattr(self.scheduler.config, "sigma_min", 0.002),
+                sigma_data=getattr(self.scheduler.config, "sigma_data", 1.0),
+                final_sigmas_type=getattr(self.scheduler.config, "final_sigmas_type", "sigma_min"),
+            )
+    @staticmethod
+    def _pad_tensor_to_length(tensor: torch.Tensor, target_len: int, dim: int = 1, pad_value: float = 0) -> torch.Tensor:
+        current_len = tensor.shape[dim]
+        if current_len >= target_len:
+            return tensor
+        pad_size = target_len - current_len
+        if tensor.dim() == 3:
+            padding = (0, 0, 0, pad_size, 0, 0)
+        elif tensor.dim() == 2:
+            padding = (0, pad_size, 0, 0)
+        else:
+            raise ValueError(f"Unsupported tensor dimension: {tensor.dim()}")
+        return torch.nn.functional.pad(tensor, padding, value=pad_value)
+    @torch.no_grad()
+    def refine_prompts(
+        self,
+        prompts: Union[str, List[str]],
+        system_prompt: Optional[str] = None,
+        temperature: float = 0.7
+    ) -> List[str]:
+        return [prompts] if isinstance(prompts, str) else prompts
+    @torch.no_grad()
+    def encode_text(self, text: Union[str, List[str]]) -> Tuple[torch.Tensor, torch.Tensor]:
+        device = self.device
+        dtype = self.transformer.dtype
+        if text is None: text = ""
+        if isinstance(text, str): text = [text]
+        text_inputs = self.tokenizer(
+            text,
+            padding="max_length",
+            max_length=self.MAX_TEXT_TOKENS,
+            truncation=True,
+            return_tensors="pt"
+        )
+        text_input_ids = text_inputs.input_ids.to(device)
+        attention_mask = text_inputs.attention_mask.to(device)
+        outputs = self.text_encoder(input_ids=text_input_ids, attention_mask=attention_mask)
+        prompt_embeds = outputs.last_hidden_state
+        lengths = attention_mask.sum(dim=1)
+        for i, length in enumerate(lengths):
+            prompt_embeds[i, length:] = 0
+        return prompt_embeds.to(dtype=dtype), attention_mask.to(dtype=torch.int64)
+    @torch.no_grad()
+    def image_upscale(self, image: Union[str, Image.Image, List[Union[str, Image.Image]]], batch_size: int = 1) -> List[Image.Image]:
+        images = [image] if isinstance(image, (str, Image.Image)) else image
+        batch_data = []
+        for img in images:
+            if isinstance(img, str): img = Image.open(img)
+            if img.mode == "RGBA":
+                img = Image.alpha_composite(Image.new("RGBA", img.size, (255, 255, 255)), img)
+            img = img.convert("RGB")
+            w, h = img.size
+            pw, ph = (8 - w % 8) % 8, (8 - h % 8) % 8
+            if pw or ph:
+                padded = Image.new("RGB", (w + pw, h + ph), (255, 255, 255))
+                padded.paste(img)
+                img = padded
+            t = torch.from_numpy(np.array(img).astype(np.float32) / 127.5 - 1.0).permute(2, 0, 1)
+            batch_data.append((t.to(self.device, torch.float16), w, h))
+        unique_shapes = {t.shape for t, _, _ in batch_data}
+        step = batch_size if len(unique_shapes) == 1 else 1
+        output_images = []
+        for i in range(0, len(batch_data), step):
+            chunk = batch_data[i : i + step]
+            tensors = torch.stack([c[0] for c in chunk]).unsqueeze(2)
+            latents = self.vae.encode(tensors).latent_dist.mean
+            decoded = self.vae.decode(latents.to(self.vae.dtype))[0]
+            if decoded.ndim == 5:
+                decoded = decoded.squeeze(2)
+            decoded = (decoded.clamp(-1, 1) + 1) / 2
+            for j, tensor in enumerate(decoded):
+                w, h = chunk[j][1], chunk[j][2]
+                arr = tensor.cpu().permute(1, 2, 0).float().numpy()
+                arr = arr[:h * 2, :w * 2]
+                output_images.append(Image.fromarray((arr * 255).astype("uint8")))
+        return output_images
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        height: int = 1024,
+        width: int = 1024,
+        num_inference_steps: int = 40,
+        guidance_scale: float = 7.0,
+        generator: Optional[torch.Generator] = None,
+        seed: Optional[int] = None,
+        output_type: str = "pil",
+        return_dict: bool = True,
+        **kwargs,
+    ):
+        device = self.device
+        dtype = self.transformer.dtype
+        if generator is None and seed is not None:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # 1. Encode Positive
+        if prompt_embeds is None:
+            if prompt is None: raise ValueError("`prompt` or `prompt_embeds` required.")
+            prompt_embeds, _ = self.encode_text(prompt)
+        prompt_embeds = prompt_embeds.to(device=device, dtype=dtype)
+        batch_size = prompt_embeds.shape[0]
+        # 2. Encode Negative
+        if do_classifier_free_guidance:
+            if negative_prompt_embeds is None:
+                neg_text = negative_prompt if negative_prompt is not None else ("" if isinstance(prompt, str) else [""] * len(prompt))
+                negative_prompt_embeds, _ = self.encode_text(neg_text)
+            negative_prompt_embeds = negative_prompt_embeds.to(device=device, dtype=dtype)
+            if negative_prompt_embeds.shape[0] != batch_size:
+                negative_prompt_embeds = negative_prompt_embeds.repeat(batch_size, 1, 1)
+            max_len = max(prompt_embeds.shape[1], negative_prompt_embeds.shape[1])
+            prompt_embeds = self._pad_tensor_to_length(prompt_embeds, max_len, dim=1, pad_value=0)
+            negative_prompt_embeds = self._pad_tensor_to_length(negative_prompt_embeds, max_len, dim=1, pad_value=0)
+        # 3. Prepare Timesteps
+        sigmas_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
+        sigmas = torch.linspace(0, 1, num_inference_steps, dtype=sigmas_dtype)
+        self.scheduler.set_timesteps(sigmas=sigmas, device=device)
+        timesteps = self.scheduler.timesteps
+        # Защита от деления на ноль на последнем шаге
+        if self.scheduler.config.get("final_sigmas_type", "zero") == "sigma_min":
+            self.scheduler.sigmas[-1] = self.scheduler.sigmas[-2]
+        if self.scheduler.sigmas[-1] == 0.0:
+            self.scheduler.sigmas[-1] = 1e-4
+        # 4. Prepare Latents (Noise)
+        latent_h = height // self.vae_scale_factor
+        latent_w = width // self.vae_scale_factor
+        in_channels = self.transformer.config.in_channels
+        sigma_max = getattr(self.scheduler.config, "sigma_max", 80.0)
+        if latents is None:
+            latents = torch.randn((batch_size, in_channels, 1, latent_h, latent_w), generator=generator, device=device, dtype=dtype)
+            latents = latents * sigma_max
+        else:
+            latents = latents.to(device=device, dtype=dtype) * sigma_max
+        # Cosmos Padding Mask
+        padding_mask = latents.new_zeros(1, 1, height, width, dtype=dtype)
+        # 5. Denoising Loop
+        for i, t in enumerate(tqdm(timesteps, desc="Sampling")):
+            current_sigma = self.scheduler.sigmas[i]
+            # Защита от деления на 0 при вычислении current_t
+            if current_sigma == 0.0:
+                current_sigma = torch.tensor(1e-4, dtype=current_sigma.dtype, device=device)
+            current_t = current_sigma / (current_sigma + 1.0)
+            c_in = 1.0 - current_t
+            c_skip = 1.0 - current_t
+            c_out = -current_t
+            latent_model_input = (latents * c_in).to(dtype)
+            timestep = current_t.expand(latents.shape[0]).to(dtype)
+            # Проход 1
+            noise_pred = self.transformer(
+                hidden_states=latent_model_input,
+                timestep=timestep,
+                encoder_hidden_states=prompt_embeds,
+                padding_mask=padding_mask,
+                return_dict=False,
+            )[0]
+            noise_pred = (c_skip * latents + c_out * noise_pred.float()).to(dtype)
+            # Проход 2
+            if do_classifier_free_guidance:
+                noise_pred_uncond = self.transformer(
+                    hidden_states=latent_model_input,
+                    timestep=timestep,
+                    encoder_hidden_states=negative_prompt_embeds,
+                    padding_mask=padding_mask,
+                    return_dict=False,
+                )[0]
+                noise_pred_uncond = (c_skip * latents + c_out * noise_pred_uncond.float()).to(dtype)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond)
+            noise_pred = (latents - noise_pred) / current_sigma
+            latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+        # 6. Decode
+        if output_type == "latent":
+            if not return_dict: return (latents, prompt)
+            return SdxsPipelineOutput(images=latents)
+        # Точная математика NVIDIA для декодирования (без двойных инверсий)
+        if getattr(self.vae.config, "latents_std", None) is not None and getattr(self.vae.config, "latents_mean", None) is not None:
+            sigma_data = getattr(self.scheduler.config, "sigma_data", 1.0)
+            l_mean = torch.tensor(self.vae.config.latents_mean).view(1, -1, 1, 1, 1).to(device, dtype)
+            l_std = torch.tensor(self.vae.config.latents_std).view(1, -1, 1, 1, 1).to(device, dtype)
+            # Оригинальная формула: делим на инвертированный std (что равноценно умножению на std)
+            latents_std_inv = 1.0 / l_std
+            latents = latents / latents_std_inv / sigma_data + l_mean
+        image_output = self.vae.decode(latents.to(self.vae.dtype), return_dict=False)[0]
+        if image_output.ndim == 5:
+            image_output = image_output.squeeze(2)
+        image_output = (image_output.clamp(-1, 1) + 1) / 2
+        image_np = image_output.cpu().permute(0, 2, 3, 1).float().numpy()
+        # На всякий случай вычищаем NaNs, если они проскользнули, чтобы скрипт не падал с кастом
+        image_np = np.nan_to_num(image_np, nan=0.0, posinf=1.0, neginf=0.0)
+        if output_type == "pil":
+            images = [(Image.fromarray((img * 255).round().astype("uint8"))) for img in image_np]
+        else:
+            images = image_np
+        if not return_dict:
+            return (images,)
+        return SdxsPipelineOutput(images=images)

scheduler/.ipynb_checkpoints/scheduler_config-checkpoint.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "_class_name": "FlowMatchEulerDiscreteScheduler",
+  "_diffusers_version": "0.34.0.dev0",
+  "base_image_seq_len": 256,
+  "base_shift": 0.5,
+  "final_sigmas_type": "sigma_min",
+  "invert_sigmas": false,
+  "max_image_seq_len": 4096,
+  "max_shift": 1.15,
+  "num_train_timesteps": 1000,
+  "shift": 1.0,
+  "shift_terminal": null,
+  "sigma_data": 1.0,
+  "sigma_max": 80.0,
+  "sigma_min": 0.002,
+  "stochastic_sampling": false,
+  "time_shift_type": "exponential",
+  "use_beta_sigmas": false,
+  "use_dynamic_shifting": false,
+  "use_exponential_sigmas": false,
+  "use_karras_sigmas": true
+}

scheduler/scheduler_config.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "_class_name": "FlowMatchEulerDiscreteScheduler",
+  "_diffusers_version": "0.34.0.dev0",
+  "base_image_seq_len": 256,
+  "base_shift": 0.5,
+  "final_sigmas_type": "sigma_min",
+  "invert_sigmas": false,
+  "max_image_seq_len": 4096,
+  "max_shift": 1.15,
+  "num_train_timesteps": 1000,
+  "shift": 1.0,
+  "shift_terminal": null,
+  "sigma_data": 1.0,
+  "sigma_max": 80.0,
+  "sigma_min": 0.002,
+  "stochastic_sampling": false,
+  "time_shift_type": "exponential",
+  "use_beta_sigmas": false,
+  "use_dynamic_shifting": false,
+  "use_exponential_sigmas": false,
+  "use_karras_sigmas": true
+}

t.py ADDED Viewed

	@@ -0,0 +1,116 @@

+from datasets import load_from_disk
+import numpy as np
+import torch
+from PIL import Image
+from collections import defaultdict
+from diffusers import AutoencoderKLQwenImage
+import gc
+def analyze_dataset_by_size(dataset_path):
+    """
+    Группирует датасет по размерам изображений и выводит базовую информацию.
+    """
+    # Настройка устройства и типа данных
+    dtype = torch.float32
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    # Загрузка VAE модели
+    print("Загрузка VAE модели...")
+    vae = AutoencoderKLQwenImage.from_pretrained("vae",torch_dtype=dtype).to(device).eval()
+    shift_factor = getattr(vae.config, "shift_factor", 0.0)
+    if shift_factor is None:
+        shift_factor = 0.0
+    scaling_factor = getattr(vae.config, "scaling_factor", 1.0)
+    if scaling_factor is None:
+        scaling_factor = 1.0
+    mean = getattr(vae.config, "latents_mean", None)
+    std = getattr(vae.config, "latents_std", None)
+    if mean is not None and std is not None:
+        latents_std = torch.tensor(std, device=device, dtype=dtype).view(1, len(std), 1, 1)
+        latents_mean = torch.tensor(mean, device=device, dtype=dtype).view(1, len(mean), 1, 1)
+    # Загружаем датасет
+    print(f"Загрузка датасета из {dataset_path}...")
+    dataset = load_from_disk(dataset_path)
+    print(f"Осталось примеров после фильтрации: {len(dataset)}")
+    # Группируем примеры по размерам
+    print("\nГруппировка примеров по размерам...")
+    size_to_indices = defaultdict(list)
+    # Собираем примеры с одинаковыми размерами
+    # Собираем примеры с одинаковыми размерами (оптимизированная версия)
+    widths = dataset["width"]
+    heights = dataset["height"]
+    for i, (w, h) in enumerate(zip(widths, heights)):
+        size_to_indices[(w, h)].append(i)
+    # Сортируем размеры по количеству примеров
+    print("\nСортируем...")
+    size_stats = [(size, len(indices)) for size, indices in size_to_indices.items()]
+    size_stats.sort(key=lambda x: x[1], reverse=True)
+    # Выводим информацию о каждой группе и показываем первый пример
+    for size, count in size_stats:
+        width, height = size
+        first_idx = size_to_indices[size][1]
+        example = dataset[first_idx]
+        print(f"\n--- Батч {width}x{height}: {count} примеров ---")
+        # Декодируем латентное представление для первого примера
+        latent = torch.tensor(example["vae"], dtype=dtype).unsqueeze(0).to(device)
+        # 1. Снова обманываем VAE, превращая картинку в "видео из 1 кадра" [B, C, 1, H, W]
+        if latent.ndim == 4:
+            latent = latent.unsqueeze(2)
+        with torch.no_grad():
+            if latents_mean is not None and latents_std is not None:
+                latent = latent * latents_std + latents_mean
+            print(f"Min of latent_for_vae: {latent.min()}")
+            print(f"Max of latent_for_vae: {latent.max()}")
+            print(f"Mean of latent_for_vae: {latent.mean()}")
+            print(f"Std: {latent.std().item():.4f}")
+            if torch.isnan(latent).any() or torch.isinf(latent).any():
+                print("WARNING: Raw latents contain NaN or Inf values!")
+            reconstructed_image = vae.decode(latent).sample
+        # 2. Вытаскиваем обычную 3D-картинку [C, H, W] из 5D-видеотензора
+        if reconstructed_image.ndim == 5:
+            # Берем нулевой батч, все каналы, нулевой кадр, всю высоту и ширину
+            img_tensor = reconstructed_image[0, :, 0, :, :]
+        else:
+            img_tensor = reconstructed_image.squeeze(0) # На всякий случай, если VAE вернул 4D
+        img_array = img_tensor.cpu().numpy()
+        img_array = np.transpose(img_array, (1, 2, 0))
+        img_array = (img_array + 1) / 2  # Нормализация к [0, 1]
+        img_array = np.clip(img_array * 255, 0, 255).astype(np.uint8)  # Преобразуем в uint8 для PIL
+        # Создаем PIL изображение из массива
+        pil_image = Image.fromarray(img_array)
+        print(f"Текст: {example['text']}")
+        print(f"Ключи: {', '.join(example.keys())}")
+        print(f"latent: {latent.shape}")
+        pil_image.save("1.jpg")
+    # Очистка памяти
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        gc.collect()
+    return size_to_indices  # Возвращаем словарь с индексами по группам
+# Использование
+if __name__ == "__main__":
+    # Путь к датасету
+    save_path = "datasets/ds234_640_vae_qwen"
+    # Анализ датасета
+    size_groups = analyze_dataset_by_size(save_path)

test.ipynb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:677906d20fb691440965fb107de2c9d8e9b7c75884d9e3e15b4375f4257df8ae
+size 21416092

text_encoder/.ipynb_checkpoints/config-checkpoint.json ADDED Viewed

	@@ -0,0 +1,101 @@

+{
+  "architectures": [
+    "Qwen3_5Model"
+  ],
+  "dtype": "bfloat16",
+  "image_token_id": 248056,
+  "model_type": "qwen3_5",
+  "text_config": {
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "attn_output_gate": true,
+    "bos_token_id": null,
+    "dtype": "bfloat16",
+    "eos_token_id": 248044,
+    "full_attention_interval": 4,
+    "head_dim": 256,
+    "hidden_act": "silu",
+    "hidden_size": 1024,
+    "initializer_range": 0.02,
+    "intermediate_size": 3584,
+    "layer_types": [
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention"
+    ],
+    "linear_conv_kernel_dim": 4,
+    "linear_key_head_dim": 128,
+    "linear_num_key_heads": 16,
+    "linear_num_value_heads": 16,
+    "linear_value_head_dim": 128,
+    "mamba_ssm_dtype": "float32",
+    "max_position_embeddings": 262144,
+    "mlp_only_layers": [],
+    "model_type": "qwen3_5_text",
+    "mtp_num_hidden_layers": 1,
+    "mtp_use_dedicated_embeddings": false,
+    "num_attention_heads": 8,
+    "num_hidden_layers": 24,
+    "num_key_value_heads": 2,
+    "pad_token_id": null,
+    "partial_rotary_factor": 0.25,
+    "rms_norm_eps": 1e-06,
+    "rope_parameters": {
+      "mrope_interleaved": true,
+      "mrope_section": [
+        11,
+        11,
+        10
+      ],
+      "partial_rotary_factor": 0.25,
+      "rope_theta": 10000000,
+      "rope_type": "default"
+    },
+    "tie_word_embeddings": true,
+    "use_cache": true,
+    "vocab_size": 248320
+  },
+  "tie_word_embeddings": true,
+  "transformers_version": "5.6.1",
+  "video_token_id": 248057,
+  "vision_config": {
+    "deepstack_visual_indexes": [],
+    "depth": 12,
+    "dtype": "bfloat16",
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 768,
+    "in_channels": 3,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "model_type": "qwen3_5_vision",
+    "num_heads": 12,
+    "num_position_embeddings": 2304,
+    "out_hidden_size": 1024,
+    "patch_size": 16,
+    "spatial_merge_size": 2,
+    "temporal_patch_size": 2
+  },
+  "vision_end_token_id": 248054,
+  "vision_start_token_id": 248053
+}

text_encoder/config.json ADDED Viewed

	@@ -0,0 +1,101 @@

+{
+  "architectures": [
+    "Qwen3_5Model"
+  ],
+  "dtype": "bfloat16",
+  "image_token_id": 248056,
+  "model_type": "qwen3_5",
+  "text_config": {
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "attn_output_gate": true,
+    "bos_token_id": null,
+    "dtype": "bfloat16",
+    "eos_token_id": 248044,
+    "full_attention_interval": 4,
+    "head_dim": 256,
+    "hidden_act": "silu",
+    "hidden_size": 1024,
+    "initializer_range": 0.02,
+    "intermediate_size": 3584,
+    "layer_types": [
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention"
+    ],
+    "linear_conv_kernel_dim": 4,
+    "linear_key_head_dim": 128,
+    "linear_num_key_heads": 16,
+    "linear_num_value_heads": 16,
+    "linear_value_head_dim": 128,
+    "mamba_ssm_dtype": "float32",
+    "max_position_embeddings": 262144,
+    "mlp_only_layers": [],
+    "model_type": "qwen3_5_text",
+    "mtp_num_hidden_layers": 1,
+    "mtp_use_dedicated_embeddings": false,
+    "num_attention_heads": 8,
+    "num_hidden_layers": 24,
+    "num_key_value_heads": 2,
+    "pad_token_id": null,
+    "partial_rotary_factor": 0.25,
+    "rms_norm_eps": 1e-06,
+    "rope_parameters": {
+      "mrope_interleaved": true,
+      "mrope_section": [
+        11,
+        11,
+        10
+      ],
+      "partial_rotary_factor": 0.25,
+      "rope_theta": 10000000,
+      "rope_type": "default"
+    },
+    "tie_word_embeddings": true,
+    "use_cache": true,
+    "vocab_size": 248320
+  },
+  "tie_word_embeddings": true,
+  "transformers_version": "5.6.1",
+  "video_token_id": 248057,
+  "vision_config": {
+    "deepstack_visual_indexes": [],
+    "depth": 12,
+    "dtype": "bfloat16",
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 768,
+    "in_channels": 3,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "model_type": "qwen3_5_vision",
+    "num_heads": 12,
+    "num_position_embeddings": 2304,
+    "out_hidden_size": 1024,
+    "patch_size": 16,
+    "spatial_merge_size": 2,
+    "temporal_patch_size": 2
+  },
+  "vision_end_token_id": 248054,
+  "vision_start_token_id": 248053
+}

text_encoder/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:be05a6e8dcacdae04865491110f227b71229110e321aa655982c4bd793ea411a
+size 1706027688

tokenizer/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,154 @@

+{#- Running counters shared across the whole template; namespaces are used so the macro below can mutate them. -#}
+{%- set image_count = namespace(value=0) %}
+{%- set video_count = namespace(value=0) %}
+{#- render_content: turn a message's content into text.
+    - string content is emitted unchanged;
+    - a non-mapping iterable is walked item by item: image/video items emit the vision placeholder tokens
+      (optionally prefixed with "Picture N: " / "Video N: " when add_vision_id is set), text items emit item.text;
+    - none/undefined content renders as an empty string;
+    - anything else raises.
+    do_vision_count controls whether the image/video counters are incremented;
+    is_system_content makes images/videos an error. -#}
+{%- macro render_content(content, do_vision_count, is_system_content=false) %}
+    {%- if content is string %}
+        {{- content }}
+    {%- elif content is iterable and content is not mapping %}
+        {%- for item in content %}
+            {%- if 'image' in item or 'image_url' in item or item.type == 'image' %}
+                {%- if is_system_content %}
+                    {{- raise_exception('System message cannot contain images.') }}
+                {%- endif %}
+                {%- if do_vision_count %}
+                    {%- set image_count.value = image_count.value + 1 %}
+                {%- endif %}
+                {%- if add_vision_id %}
+                    {{- 'Picture ' ~ image_count.value ~ ': ' }}
+                {%- endif %}
+                {{- '<|vision_start|><|image_pad|><|vision_end|>' }}
+            {%- elif 'video' in item or item.type == 'video' %}
+                {%- if is_system_content %}
+                    {{- raise_exception('System message cannot contain videos.') }}
+                {%- endif %}
+                {%- if do_vision_count %}
+                    {%- set video_count.value = video_count.value + 1 %}
+                {%- endif %}
+                {%- if add_vision_id %}
+                    {{- 'Video ' ~ video_count.value ~ ': ' }}
+                {%- endif %}
+                {{- '<|vision_start|><|video_pad|><|vision_end|>' }}
+            {%- elif 'text' in item %}
+                {{- item.text }}
+            {%- else %}
+                {{- raise_exception('Unexpected item type in content.') }}
+            {%- endif %}
+        {%- endfor %}
+    {%- elif content is none or content is undefined %}
+        {{- '' }}
+    {%- else %}
+        {{- raise_exception('Unexpected content type.') }}
+    {%- endif %}
+{%- endmacro %}
+{#- An empty conversation is an error. -#}
+{%- if not messages %}
+    {{- raise_exception('No messages provided.') }}
+{%- endif %}
+{#- System turn. When a tool list is supplied, the system turn starts with the tool catalogue and the
+    call-format instructions, followed by the (trimmed) first system message if there is one.
+    Without tools, a system turn is emitted only when the first message has role "system". -#}
+{%- if tools and tools is iterable and tools is not mapping %}
+    {{- '<|im_start|>system\n' }}
+    {{- "# Tools\n\nYou have access to the following functions:\n\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>" }}
+    {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
+    {%- if messages[0].role == 'system' %}
+        {%- set content = render_content(messages[0].content, false, true)|trim %}
+        {%- if content %}
+            {{- '\n\n' + content }}
+        {%- endif %}
+    {%- endif %}
+    {{- '<|im_end|>\n' }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {%- set content = render_content(messages[0].content, false, true)|trim %}
+        {{- '<|im_start|>system\n' + content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{#- Walk the messages in reverse to find the index of the last user message whose rendered content is not
+    wrapped in <tool_response>...</tool_response>. ns.multi_step_tool stays true until such a message is
+    found; if it is still true after the scan, there is no real user query and the template raises. -#}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" %}
+        {%- set content = render_content(message.content, false)|trim %}
+        {%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}
+            {%- set ns.multi_step_tool = false %}
+            {%- set ns.last_query_index = index %}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if ns.multi_step_tool %}
+    {{- raise_exception('No user query found in messages.') }}
+{%- endif %}
+{#- Main rendering loop, one turn per message:
+    - system: produces no output here (it was rendered above); raises if it is not the first message;
+    - user: plain <|im_start|>user ... <|im_end|> turn;
+    - assistant: reasoning is taken from message.reasoning_content when it is a string, otherwise split out
+      of the content at </think>; the <think> block is only emitted for assistant turns after the last user
+      query (ns.last_query_index); tool calls are appended as <tool_call>/<function=...>/<parameter=...> blocks;
+    - tool: responses are wrapped in <tool_response> tags inside a user turn, with consecutive tool messages
+      sharing a single turn;
+    - any other role raises. -#}
+{%- for message in messages %}
+    {%- set content = render_content(message.content, true)|trim %}
+    {%- if message.role == "system" %}
+        {%- if not loop.first %}
+            {{- raise_exception('System message must be at the beginning.') }}
+        {%- endif %}
+    {%- elif message.role == "user" %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- set reasoning_content = reasoning_content|trim %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content }}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if tool_call.function is defined %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {%- if loop.first %}
+                    {%- if content|trim %}
+                        {{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
+                    {%- else %}
+                        {{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
+                    {%- endif %}
+                {%- else %}
+                    {{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
+                {%- endif %}
+                {%- if tool_call.arguments is defined %}
+                    {%- for args_name, args_value in tool_call.arguments|items %}
+                        {{- '<parameter=' + args_name + '>\n' }}
+                        {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
+                        {{- args_value }}
+                        {{- '\n</parameter>\n' }}
+                    {%- endfor %}
+                {%- endif %}
+                {{- '</function>\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.previtem and loop.previtem.role != "tool" %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if not loop.last and loop.nextitem.role != "tool" %}
+            {{- '<|im_end|>\n' }}
+        {%- elif loop.last %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- else %}
+        {{- raise_exception('Unexpected message role.') }}
+    {%- endif %}
+{%- endfor %}
+{#- Generation prompt: open an assistant turn. With enable_thinking defined and true the <think> block is
+    left open for the model to fill; otherwise an empty, already-closed <think> block is emitted. -#}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is true %}
+        {{- '<think>\n' }}
+    {%- else %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}

tokenizer/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:06b9509352d2af50381ab2247e083b80d32d5c0aba91c272ca9ff729b6a0e523
+size 19989325

tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "add_prefix_space": false,
+  "audio_bos_token": "<|audio_start|>",
+  "audio_eos_token": "<|audio_end|>",
+  "audio_token": "<|audio_pad|>",
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "image_token": "<|image_pad|>",
+  "is_local": false,
+  "local_files_only": false,
+  "model_max_length": 262144,
+  "model_specific_special_tokens": {
+    "audio_bos_token": "<|audio_start|>",
+    "audio_eos_token": "<|audio_end|>",
+    "audio_token": "<|audio_pad|>",
+    "image_token": "<|image_pad|>",
+    "video_token": "<|video_pad|>",
+    "vision_bos_token": "<|vision_start|>",
+    "vision_eos_token": "<|vision_end|>"
+  },
+  "pad_token": "<|endoftext|>",
+  "pretokenize_regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null,
+  "video_token": "<|video_pad|>",
+  "vision_bos_token": "<|vision_start|>",
+  "vision_eos_token": "<|vision_end|>"
+}

train-Copy1.py ADDED Viewed

	@@ -0,0 +1,924 @@

+import os
+import math
+import torch
+import numpy as np
+import matplotlib.pyplot as plt
+import wandb, comet_ml
+import random, time
+import gc
+import bitsandbytes as bnb
+import torch.nn.functional as F
+import argparse
+from datetime import datetime
+from diffusers import CosmosTransformer3DModel, AutoencoderKLQwenImage, FlowMatchEulerDiscreteScheduler
+from transformers import Qwen3_5Tokenizer, Qwen3_5ForConditionalGeneration
+from torch.utils.data import DataLoader, Sampler
+from torch.optim.lr_scheduler import LambdaLR
+from collections import defaultdict
+from accelerate import Accelerator
+from datasets import load_from_disk
+from tqdm import tqdm
+from PIL import Image, ImageOps
+from torch.utils.checkpoint import checkpoint
+from diffusers.models.attention_processor import AttnProcessor2_0
+from contextlib import nullcontext
+from transformers.optimization import Adafactor
+# Muon not tested! pip install git+https://github.com/recoilme/muon_adamw8bit.git
+from muon_adamw8bit import MuonAdamW8bit
+# NCCL and CUDA allocator settings, exported through environment variables.
+os.environ["NCCL_P2P_DISABLE"] = "1"
+os.environ["NCCL_IB_DISABLE"] = "1" # comment this on H100!
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+# --------------------------- Parameters ---------------------------
+ds_path = "datasets/ds234_640_vae_qwen"
+project = "transformer"
+# Per-GPU batch size scales with the memory of GPU 0: 7 samples per 32 GB, at least 1.
+gpu_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
+local_bs = max(1, int((gpu_mem_gb / 32) * 7))
+num_gpus = torch.cuda.device_count()
+batch_size = local_bs * num_gpus
+# Both learning-rate bounds are divided by learning_rate_scale below.
+base_learning_rate = 4e-5
+min_learning_rate = 4e-6
+learning_rate_scale = 3
+base_learning_rate = base_learning_rate / learning_rate_scale
+min_learning_rate = min_learning_rate / learning_rate_scale
+print(f"Calculated params max-lr:{base_learning_rate} min-lr:{min_learning_rate} GPUs: {num_gpus}, Global BS: {batch_size}")
+# The default epoch count equals the number of visible GPUs (can be overridden with --ep).
+num_epochs = num_gpus
+sink_interval_share = 10
+sample_interval_min = 20
+# Probability of replacing a caption with the empty string in collate_fn_simple.
+cfg_dropout = 0.10
+# Timestep t: bias = -0.5 (focus on details, ~300), bias = 0.5 (focus on structure), bias = 0 (bell-shaped / uniform)
+sigmoid_bias = 0.1
+# Token length that prompts are padded/truncated to in encode_texts.
+max_length = 250
+use_precomputed_embeddings = False
+use_wandb = False
+use_comet_ml = False
+save_model = True
+use_decay = True
+fbp = False
+torch_compile = False
+transformer_gradient = True
+loss_normalize = False
+fixed_seed = False
+shuffle = True
+optimizer_type = "adafactor"
+# The Muon optimizer path uses a smaller per-GPU batch (3 samples per 32 GB) and its own LR multiplier.
+if optimizer_type == "muon_adam8bit":
+    batch_size = num_gpus * max(1, int((gpu_mem_gb / 32) * 3))
+    muon_lr_scale = 500
+comet_ml_api_key = "Agctp26mbqnoYrrlvQuKSTk6r"
+comet_ml_workspace = "recoilme"
+# Allow TF32 matmuls/convolutions and restrict scaled-dot-product attention to the flash and
+# memory-efficient kernels (the math fallback is disabled).
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+torch.backends.cuda.enable_flash_sdp(True)
+torch.backends.cuda.enable_mem_efficient_sdp(True)
+torch.backends.cuda.enable_math_sdp(False)
+save_barrier = 1.25
+# Fraction of the schedule spent in linear warmup (see lr_schedule).
+warmup_percent = 0.0025
+# Second Adam beta and epsilon, used by the Adam-family branches of create_optimizer.
+betta2 = 0.997
+eps = 1e-6
+clip_grad_norm = 1.0
+# When > 0, only the first `limit` dataset rows are used.
+limit = 0
+checkpoints_folder = ""
+gradient_accumulation_steps = 1
+# Tensors and weights are created in float32; bf16 is requested through Accelerate's mixed precision.
+dtype = torch.float32
+mixed_precision = "bf16"
+# Diffusion parameters
+n_diffusion_steps = 40
+samples_to_generate = 12
+guidance_scale = 7.0
+# Folders for saving results
+generated_folder = "samples"
+os.makedirs(generated_folder, exist_ok=True)
+# Seed setup: the seed is derived from today's date (YYYYMMDD as an integer) plus 42,
+# and is only applied when fixed_seed is enabled.
+current_date = datetime.now()
+seed = int(current_date.strftime("%Y%m%d")) + 42
+if fixed_seed:
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+    random.seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+accelerator = Accelerator(
+    mixed_precision=mixed_precision,
+    gradient_accumulation_steps=gradient_accumulation_steps
+)
+device = accelerator.device
+print("init")
+# Command-line overrides; every option defaults to the value computed above.
+parser = argparse.ArgumentParser(description='Train a model on a dataset.')
+parser.add_argument('--ds-path', type=str, default=ds_path, help='Path to the dataset')
+parser.add_argument('--ep', type=int, default=num_epochs, help='Number of epochs to train the model')
+parser.add_argument('--batch', type=int, default=batch_size, help='Total batch size')
+parser.add_argument('--min-lr', type=float, default=min_learning_rate, help='Minimum learning rate')
+parser.add_argument('--max-lr', type=float, default=base_learning_rate, help='Maximum learning rate')
+parser.add_argument('--dry-run', action='store_true',default=False, help='Dry run train without saving/sampling')
+parser.add_argument('--lvl', type=float, default=0.0, help='Train level, from 0.5 to 5')
+args = parser.parse_args()
+batch_size = args.batch
+ds_path = args.ds_path
+base_learning_rate = args.max_lr
+min_learning_rate = args.min_lr
+num_epochs = args.ep
+lvl = args.lvl
+# A dry run only disables checkpoint saving here.
+if args.dry_run:
+    save_model = False
+# --lvl divides both learning-rate bounds; values below 0.1 (including the default 0.0) leave them untouched.
+# NOTE(review): the help text advertises 0.5..5 but the code accepts anything >= 0.1 — confirm the intended range.
+if lvl >= 0.1:
+    base_learning_rate = base_learning_rate / lvl
+    min_learning_rate = min_learning_rate / lvl
+    print(f"max-lr:{base_learning_rate} min-lr:{min_learning_rate}")
+# --------------------------- Experiment tracker initialization (WandB / Comet ML) ---------------------------
+# Trackers are only created on the main process, and only when the corresponding flag is enabled.
+if accelerator.is_main_process:
+    if use_wandb:
+        wandb.init(project=project, config={
+            "batch_size": batch_size,
+            "base_learning_rate": base_learning_rate,
+            "num_epochs": num_epochs,
+            "optimizer_type": optimizer_type,
+        })
+    if use_comet_ml:
+        from comet_ml import Experiment
+        comet_experiment = Experiment(
+            api_key=comet_ml_api_key,
+            project_name=project,
+            workspace=comet_ml_workspace
+        )
+        hyper_params = {
+            "batch_size": batch_size,
+            "base_learning_rate": base_learning_rate,
+            "num_epochs": num_epochs,
+        }
+        comet_experiment.log_parameters(hyper_params)
+# --------------------------- Model loading ---------------------------
+# The VAE and scheduler are loaded from the local "vae" and "scheduler" folders; the VAE is put in eval mode.
+vae = AutoencoderKLQwenImage.from_pretrained("vae", torch_dtype=dtype).to(device).to(dtype=dtype).eval()
+scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained("scheduler")
+# Tokenizer and text encoder are module-level singletons, filled in lazily by load_text_encoder().
+tokenizer = None
+text_encoder = None
+def load_text_encoder():
+    """Load the tokenizer and text encoder into the module globals if they are not loaded yet.
+
+    Each of the two is loaded independently, so calling this again after one of them was
+    reset to None reloads only the missing one.
+    """
+    global tokenizer, text_encoder
+    if tokenizer is None:
+        tokenizer = Qwen3_5Tokenizer.from_pretrained("tokenizer")
+    if text_encoder is None:
+        text_encoder = Qwen3_5ForConditionalGeneration.from_pretrained(
+            "text_encoder",
+            torch_dtype=dtype
+        ).to(device).eval()
+load_text_encoder()
+@torch.no_grad()
+def encode_texts(text, max_length=max_length):
+    if text is None:
+        text = ""
+    if isinstance(text, str):
+        text = [text]
+    formatted_prompts = []
+    for t in text:
+        messages = [{"role": "user", "content": [{"type": "text", "text": t}]}]
+        formatted_prompts.append(
+            tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=False
+            )
+        )
+    toks = tokenizer(
+        formatted_prompts,
+        padding="max_length",
+        max_length=max_length,
+        truncation=True,
+        return_tensors="pt"
+    ).to(device)
+    outputs = text_encoder(
+        input_ids=toks.input_ids,
+        attention_mask=toks.attention_mask,
+        output_hidden_states=True
+    )
+    hidden = outputs.hidden_states[-2].to(dtype=dtype)
+    lengths = toks.attention_mask.sum(dim=1)
+    for i, length in enumerate(lengths):
+        hidden[i, length:] = 0
+    return hidden, toks.attention_mask.to(dtype=torch.int64)
+@torch.no_grad()
+def encode_texts_fast(text, max_length=max_length):
+    if text is None: text = ""
+    if isinstance(text, str): text = [text]
+    formatted_prompts = []
+    for t in text:
+        messages = [{"role": "user", "content": [{"type": "text", "text": t}]}]
+        formatted_prompts.append(tokenizer.apply_chat_template(messages, add_generation_prompt=False, tokenize=False))
+    toks = tokenizer(formatted_prompts, padding="max_length", max_length=max_length, truncation=True, return_tensors="pt").to(device)
+    outputs = text_encoder(input_ids=toks.input_ids, attention_mask=toks.attention_mask, output_hidden_states=True)
+    last_hidden = outputs.hidden_states[-2].to(dtype=dtype)
+    lengths = toks.attention_mask.sum(dim=1)
+    for i, length in enumerate(lengths):
+        last_hidden[i, length:] = 0
+    return last_hidden, toks.attention_mask.to(dtype=torch.int64)
+# Latent normalization constants read from the VAE config; missing or None values fall back to
+# the identity transform (shift 0, scale 1).
+shift_factor = getattr(vae.config, "shift_factor", 0.0)
+if shift_factor is None:
+    shift_factor = 0.0
+scaling_factor = getattr(vae.config, "scaling_factor", 1.0)
+if scaling_factor is None:
+    scaling_factor = 1.0
+# Optional per-channel latent statistics, reshaped to (1, C, 1, 1) for broadcasting.
+mean = getattr(vae.config, "latents_mean", None)
+std = getattr(vae.config, "latents_std", None)
+if mean is not None and std is not None:
+    latents_std = torch.tensor(std, device=device, dtype=dtype).view(1, len(std), 1, 1)
+    latents_mean = torch.tensor(mean, device=device, dtype=dtype).view(1, len(mean), 1, 1)
+    # Note: Cosmos uses the inverted std for decoding (1.0 / std)
+    #latents_std = 1.0 / torch.tensor(std).view(1, len(std), 1, 1, 1)
+else:
+    latents_std = None
+    latents_mean = None
+# Make sure the scheduler config carries sigma_max / sigma_min / sigma_data / final_sigmas_type,
+# keeping any value already present and filling in the defaults below otherwise.
+if scheduler is not None:
+    scheduler.register_to_config(
+        sigma_max=getattr(scheduler.config, "sigma_max", 80.0),
+        sigma_min=getattr(scheduler.config, "sigma_min", 0.002),
+        sigma_data=getattr(scheduler.config, "sigma_data", 1.0),
+        final_sigmas_type=getattr(scheduler.config, "final_sigmas_type", "sigma_min"),
+    )
+import numpy as np
+from torch.utils.data import Sampler
+class DistributedResolutionBatchSampler(Sampler):
+    """Batch sampler that yields per-rank batches in which every sample has the same (width, height).
+
+    All batches are built once in ``__init__``: indices are grouped by resolution, each group is
+    cut into global batches of ``batch_size // num_replicas * num_replicas`` indices, and the
+    remainder of every group is dropped. On iteration each rank receives its own contiguous
+    slice of every global batch.
+    """
+    def __init__(self, dataset, batch_size, num_replicas, rank, drop_last=True, shuffle=True):
+        """Pre-compute the global batches.
+
+        Args:
+            dataset: object supporting ``dataset["width"]`` / ``dataset["height"]`` and ``len()``.
+            batch_size: global batch size (summed over all replicas).
+            num_replicas: number of processes taking part in training.
+            rank: index of the current process.
+            drop_last: stored on the instance; incomplete batches are always dropped in this
+                implementation regardless of its value.
+            shuffle: whether the batch order is shuffled on each iteration.
+        """
+        self.dataset = dataset
+        self.num_replicas = num_replicas
+        self.rank = rank
+        self.shuffle = shuffle
+        self.drop_last = drop_last
+        self.epoch = 0
+        # Per-rank batch size, and the global batch size rounded down to a multiple of num_replicas.
+        self.batch_size = max(1, batch_size // num_replicas)
+        self.global_batch = self.batch_size * num_replicas
+        try:
+            widths = np.asarray(dataset["width"])
+            heights = np.asarray(dataset["height"])
+        except KeyError:
+            # No resolution columns: treat the whole dataset as a single (0, 0) group.
+            widths = np.zeros(len(dataset))
+            heights = np.zeros(len(dataset))
+        groups = {}
+        for i, (w, h) in enumerate(zip(widths, heights)):
+            groups.setdefault((w, h), []).append(i)
+        all_batches = []
+        for indices in groups.values():
+            idx = np.asarray(indices, dtype=np.int64)
+            num_batches = len(idx) // self.global_batch
+            # Groups smaller than one global batch contribute nothing.
+            if num_batches == 0:
+                continue
+            idx = idx[: num_batches * self.global_batch]
+            batches = idx.reshape(num_batches, self.global_batch)
+            all_batches.append(batches)
+        if len(all_batches) > 0:
+            self.global_batches = np.concatenate(all_batches, axis=0)
+        else:
+            self.global_batches = np.empty((0, self.global_batch), dtype=np.int64)
+        self.num_batches = len(self.global_batches)
+    def __iter__(self):
+        """Yield this rank's slice of every global batch as a NumPy index array."""
+        # The RNG is seeded with the epoch only, so the shuffled order depends on nothing
+        # rank-specific.
+        rng = np.random.RandomState(self.epoch)
+        order = np.arange(self.num_batches)
+        if self.shuffle:
+            rng.shuffle(order)
+        start = self.rank * self.batch_size
+        end = start + self.batch_size
+        for i in order:
+            yield self.global_batches[i][start:end]
+    def __len__(self):
+        """Return the number of batches yielded per iteration."""
+        return self.num_batches
+    def set_epoch(self, epoch):
+        """Set the epoch used to seed the shuffle of the next iteration."""
+        self.epoch = epoch
+def get_fixed_samples_by_resolution(dataset, samples_per_group=1):
+    size_groups = defaultdict(list)
+    try:
+        widths = dataset["width"]
+        heights = dataset["height"]
+    except KeyError:
+        widths = [0] * len(dataset)
+        heights = [0] * len(dataset)
+    for i, (w, h) in enumerate(zip(widths, heights)):
+        size = (w, h)
+        size_groups[size].append(i)
+    fixed_samples = {}
+    for size, indices in size_groups.items():
+        n_samples = min(samples_per_group, len(indices))
+        if len(size_groups)==1:
+            n_samples = samples_to_generate
+        if n_samples == 0:
+            continue
+        sample_indices = random.sample(indices, n_samples)
+        samples_data = [dataset[idx] for idx in sample_indices]
+        latents = torch.tensor(np.array([item["vae"] for item in samples_data])).to(device=device, dtype=dtype)
+        if latents.ndim == 4:
+            latents = latents.unsqueeze(2)
+        elif latents.ndim == 6:
+            latents = latents.squeeze(2)
+        texts = [item["text"] for item in samples_data]
+        if use_precomputed_embeddings:
+            embeddings = torch.tensor(
+                np.array([item["embeddings"] for item in samples_data]),
+                device=device,
+                dtype=dtype
+            )
+            masks = torch.tensor(
+                np.array([item["attention_mask"] for item in samples_data]),
+                device=device,
+                dtype=torch.int64
+            )
+        else:
+            embeddings, masks = encode_texts(texts,max_length)
+        fixed_samples[size] = (latents, embeddings, masks, texts)
+    print(f"Создано {len(fixed_samples)} групп фиксированных семплов по разрешениям")
+    return fixed_samples
+# Load the dataset from disk; a positive `limit` keeps only the first `limit` rows.
+if limit > 0:
+    dataset = load_from_disk(ds_path).select(range(limit))
+else:
+    dataset = load_from_disk(ds_path)
+print(f"images: {len(dataset)}")
+def collate_fn_simple(batch):
+    """Collate dataset rows into ``(latents, embeddings, attention_mask)`` tensors on ``device``.
+
+    Latents are read from the "vae" field as float16 and cast to ``dtype``. With
+    ``use_precomputed_embeddings`` the "embeddings" / "attention_mask" fields are used directly;
+    otherwise the "text" captions are cleaned up and encoded on the fly with ``encode_texts``.
+    """
+    latents = torch.from_numpy(
+        np.array([item["vae"] for item in batch], dtype=np.float16)
+    ).to(device, dtype=dtype)
+    # Normalize the latent rank to 5 dimensions: add an axis at position 2 for 4-D input,
+    # drop axis 2 for 6-D input.
+    if latents.ndim == 4:
+        latents = latents.unsqueeze(2)
+    elif latents.ndim == 6:
+        latents = latents.squeeze(2)
+    if use_precomputed_embeddings:
+        embeddings = torch.from_numpy(
+            np.array([item["embeddings"] for item in batch], dtype=np.float16)
+        ).to(device, dtype=dtype)
+        attention_mask = torch.from_numpy(
+            np.array([item["attention_mask"] for item in batch], dtype=np.int64)
+        ).to(device)
+        return latents, embeddings, attention_mask
+    raw_texts = [item["text"] for item in batch]
+    # Caption preprocessing, first matching rule wins:
+    #   1. captions starting with "zero" (case-insensitive) become the empty prompt;
+    #   2. with probability cfg_dropout the caption is dropped (empty prompt);
+    #   3. a leading "." is removed together with the whitespace that follows it;
+    #   4. otherwise boilerplate openers such as "The image shows " are stripped.
+    texts = [
+        "" if t.lower().startswith("zero")
+        else "" if random.random() < cfg_dropout
+        else t[1:].lstrip() if t.startswith(".")
+        else t.replace("The image shows ", "").replace("The image is ", "").replace("This image captures ","").strip()
+        for t in raw_texts
+    ]
+    embeddings, attention_mask = encode_texts(texts,max_length)
+    attention_mask = attention_mask.to(dtype=torch.int64)
+    return latents, embeddings, attention_mask
+# Resolution-bucketed batch sampler: each process receives its own slice of every global batch.
+batch_sampler = DistributedResolutionBatchSampler(
+        dataset=dataset,
+        batch_size=batch_size,
+        num_replicas=accelerator.num_processes,
+        rank=accelerator.process_index,
+        shuffle = shuffle
+    )
+dataloader = DataLoader(dataset, batch_sampler=batch_sampler, collate_fn=collate_fn_simple)
+if accelerator.is_main_process:
+    # NOTE(review): len(dataloader) is the number of batches, not samples, despite the label.
+    print("Total samples", len(dataloader))
+dataloader = accelerator.prepare(dataloader)
+start_epoch = 0
+global_step = 0
+total_training_steps = (len(dataloader) * num_epochs)
+world_size = accelerator.state.num_processes
+# The transformer is always loaded from <checkpoints_folder>/<project>; a missing folder is a hard error.
+latest_checkpoint = os.path.join(checkpoints_folder, project)
+if os.path.isdir(latest_checkpoint):
+    print("Загружаем Transformer из чекпоинта:", latest_checkpoint)
+    transformer = CosmosTransformer3DModel.from_pretrained(latest_checkpoint).to(device=device, dtype=dtype)
+    if transformer_gradient:
+        transformer.enable_gradient_checkpointing()
+else:
+    raise FileNotFoundError(f"Transformer checkpoint not found at {latest_checkpoint}")
+def create_optimizer(name, params):
+    """Build the optimizer selected by ``name`` for ``params``.
+
+    Args:
+        name: one of "adam8bit", "adam", "adafactor" or "muon_adam8bit".
+        params: parameters (or parameter groups) handed to the optimizer.
+
+    Returns:
+        The constructed optimizer; all variants start at ``base_learning_rate``.
+
+    Raises:
+        ValueError: if ``name`` is not one of the supported optimizers.
+    """
+    if name == "adam8bit":
+        return bnb.optim.AdamW8bit(
+            params, lr=base_learning_rate, betas=(0.9, betta2), eps=eps, weight_decay=0.001
+        )
+    elif name == "adam":
+        return torch.optim.AdamW(
+            params, lr=base_learning_rate, betas=(0.9, betta2), eps=eps, weight_decay=0.001
+        )
+    elif name == "adafactor":
+        # An explicit learning rate is used: relative_step, scale_parameter and warmup_init are off.
+        return Adafactor(
+            params,
+            lr=base_learning_rate,
+            eps=(1e-30, 1e-3),
+            clip_threshold=1.0,
+            decay_rate=-0.8,
+            beta1=None,
+            weight_decay=0.001,
+            relative_step=False,
+            scale_parameter=False,
+            warmup_init=False
+        )
+    elif name == "muon_adam8bit":
+        # muon_lr_scale is only defined when optimizer_type == "muon_adam8bit" at module level.
+        return MuonAdamW8bit(
+            params,
+            lr=base_learning_rate,
+            betas=(0.9, betta2),
+            eps=eps,
+            weight_decay=0.01,
+            muon_lr_mult=muon_lr_scale,
+        )
+    else:
+        raise ValueError(f"Unknown optimizer: {name}")
+if fbp:
+    trainable_params = list(transformer.parameters())
+    optimizer_dict = {p: create_optimizer(optimizer_type, [p]) for p in trainable_params}
+    def optimizer_hook(param):
+        optimizer_dict[param].step()
+        optimizer_dict[param].zero_grad(set_to_none=True)
+    for param in trainable_params:
+        param.register_post_accumulate_grad_hook(optimizer_hook)
+    transformer, optimizer = accelerator.prepare(transformer, optimizer_dict)
+else:
+    #transformer.requires_grad_(True)
+    # 1. Сначала замораживаем ВООБЩЕ ВСЕ параметры
+    transformer.requires_grad_(False)
+    # 2. Определяем ключевое слово для слоев, которые нужно учить (Cross-Attention)
+    trainable_params_names = ["attn2"]
+    trainable_params = []
+    print("--- РАЗМОРОЖЕННЫЕ СЛОИ ---")
+    for name, param in transformer.named_parameters():
+        if any(target in name for target in trainable_params_names):
+            param.requires_grad_(True) # Размораживаем
+            trainable_params.append(param)
+            print(f"Обучаемый слой: {name}")
+    print("--------------------------")
+    # Защита от дурака
+    if len(trainable_params) == 0:
+        raise ValueError("Ошибка: ни один слой не был разморожен! Проверь ключи.")
+    optimizer = create_optimizer(optimizer_type, transformer.parameters())
+    def lr_schedule(step):
+        x = step / (total_training_steps * world_size)
+        warmup = warmup_percent
+        if not use_decay:
+            return base_learning_rate
+        if x < warmup:
+            return min_learning_rate + (base_learning_rate - min_learning_rate) * (x / warmup)
+        decay_ratio = (x - warmup) / (1 - warmup)
+        return min_learning_rate + 0.5 * (base_learning_rate - min_learning_rate) * \
+               (1 + math.cos(math.pi * decay_ratio))
+    lr_scheduler = LambdaLR(optimizer, lambda step: lr_schedule(step) / base_learning_rate)
+if torch_compile:
+    print("Compiling Transformer... Это займет несколько минут, не прерывайте!")
+    transformer = torch.compile(transformer)
+    print("Compiling - ok")
+if not fbp:
+    transformer, optimizer, lr_scheduler = accelerator.prepare(transformer, optimizer, lr_scheduler)
+# Фиксированные семплы
+fixed_samples = get_fixed_samples_by_resolution(dataset)
+def get_negative_embedding(neg_prompt="", batch_size=1):
+    """Return the unconditional (negative-prompt) embedding and mask, repeated ``batch_size`` times.
+
+    With an empty ``neg_prompt`` an all-zero embedding of shape
+    ``(batch_size, max_length, 2048)`` and an all-ones mask are returned without touching the
+    text encoder; otherwise the prompt is encoded with ``encode_texts``.
+    """
+    if not neg_prompt:
+        # NOTE(review): hidden_dim is hard-coded to 2048, while text_encoder/config.json in this
+        # commit lists hidden_size 1024 for the text model — confirm which width the transformer
+        # expects before relying on this branch.
+        hidden_dim = 2048
+        seq_len = max_length
+        empty_emb = torch.zeros((batch_size, seq_len, hidden_dim), dtype=dtype, device=device)
+        empty_mask = torch.ones((batch_size, seq_len), dtype=torch.int64, device=device)
+        return empty_emb, empty_mask
+    uncond_emb, uncond_mask  = encode_texts([neg_prompt],max_length)
+    uncond_emb = uncond_emb.to(dtype=dtype, device=device).repeat(batch_size, 1, 1)
+    uncond_mask = uncond_mask.to(device=device).repeat(batch_size, 1)
+    return uncond_emb, uncond_mask
+# Pre-compute the "low quality" negative embedding once. When embeddings are precomputed the
+# text encoder is not needed afterwards, so the result is moved to the CPU and the encoder is released.
+if use_precomputed_embeddings:
+    load_text_encoder()
+    uncond_emb, uncond_mask = get_negative_embedding("low quality")
+    uncond_emb = uncond_emb.to("cpu")
+    uncond_mask = uncond_mask.to("cpu")
+    del text_encoder
+    torch.cuda.empty_cache()
+    gc.collect()
+    # Re-bind the global so later `text_encoder is None` checks keep working after the `del`.
+    text_encoder = None
+else:
+    uncond_emb, uncond_mask = get_negative_embedding("low quality")
+def pad_to_match(a, b, pad_value=0):
+    Ta, Tb = a.shape[1], b.shape[1]
+    if Ta == Tb:
+        return a, b
+    T = max(Ta, Tb)
+    def pad(x, T_target):
+        pad_len = T_target - x.shape[1]
+        if pad_len <= 0:
+            return x
+        return torch.nn.functional.pad(x, (0, 0, 0, pad_len), value=pad_value)
+    return pad(a, T), pad(b, T)
+@torch.compiler.disable()
+@torch.no_grad()
+def generate_and_save_samples(fixed_samples_cpu, uncond_data, step):
+    uncond_emb, uncond_mask = uncond_data
+    uncond_emb = uncond_emb.to(device)
+    uncond_mask = uncond_mask.to(device)
+    original_model = None
+    try:
+        if not torch_compile:
+            original_model = accelerator.unwrap_model(transformer, keep_torch_compile=True).eval()
+        else:
+            original_model = transformer.eval()
+        vae.to(device=device).eval()
+        all_generated_images = []
+        all_captions = []
+        for size, (sample_latents, sample_text_embeddings, sample_mask, sample_text) in fixed_samples_cpu.items():
+            width, height = size
+            curr_batch_size = sample_latents.shape[0]
+            in_channels = original_model.config.in_channels
+            sample_text_embeddings = sample_text_embeddings.to(dtype=dtype, device=device)
+            sigmas_dtype = torch.float32
+            sigmas = torch.linspace(0, 1, n_diffusion_steps, dtype=sigmas_dtype)
+            scheduler.set_timesteps(sigmas=sigmas, device=device)
+            if scheduler.config.get("final_sigmas_type", "zero") == "sigma_min":
+                scheduler.sigmas[-1] = scheduler.sigmas[-2]
+            if scheduler.sigmas[-1] == 0.0:
+                scheduler.sigmas[-1] = 1e-4
+            sigma_max = getattr(scheduler.config, "sigma_max", 80.0)
+            latents = torch.randn(
+                (curr_batch_size, in_channels, 1, sample_latents.shape[3], sample_latents.shape[4]),
+                device=device,
+                dtype=dtype,
+                generator=torch.Generator(device=device).manual_seed(seed)
+            ) * sigma_max
+            padding_mask = torch.zeros((1, 1, sample_latents.shape[3], sample_latents.shape[4]), device=device, dtype=dtype)
+            if guidance_scale != 1:
+                neg_emb_batch = uncond_emb[0:1].expand(curr_batch_size, -1, -1)
+                neg_emb_batch, sample_text_embeddings = pad_to_match(neg_emb_batch, sample_text_embeddings)
+            for i, t in enumerate(scheduler.timesteps):
+                current_sigma = scheduler.sigmas[i]
+                if current_sigma == 0.0:
+                    current_sigma = torch.tensor(1e-4, dtype=current_sigma.dtype, device=device)
+                current_t = current_sigma / (current_sigma + 1.0)
+                c_in = 1.0 - current_t
+                c_skip = 1.0 - current_t
+                c_out = -current_t
+                latent_model_input = (latents * c_in).to(dtype)
+                t_val = float(current_t.item()) if torch.is_tensor(current_t) else float(current_t)
+                timestep_tensor = torch.tensor([t_val], device=device, dtype=dtype).expand(curr_batch_size)
+                noise_pred = original_model(
+                    hidden_states=latent_model_input,
+                    timestep=timestep_tensor,
+                    encoder_hidden_states=sample_text_embeddings,
+                    padding_mask=padding_mask,
+                    return_dict=False
+                )[0]
+                noise_pred = (c_skip * latents + c_out * noise_pred.float()).to(dtype)
+                if guidance_scale != 1:
+                    noise_pred_uncond = original_model(
+                        hidden_states=latent_model_input,
+                        timestep=timestep_tensor,
+                        encoder_hidden_states=neg_emb_batch,
+                        padding_mask=padding_mask,
+                        return_dict=False
+                    )[0]
+                    noise_pred_uncond = (c_skip * latents + c_out * noise_pred_uncond.float()).to(dtype)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond)
+                noise_pred = (latents - noise_pred) / current_sigma
+                latents = scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+            current_latents = latents
+            if step == 0:
+                current_latents = sample_latents
+            if latents_mean is not None and latents_std is not None:
+                sigma_data = getattr(scheduler.config, "sigma_data", 1.0)
+                # Переводим векторы нормализации в float32
+                l_mean = torch.tensor(vae.config.latents_mean).view(1, -1, 1, 1, 1).to(device, torch.float32)
+                l_std = torch.tensor(vae.config.latents_std).view(1, -1, 1, 1, 1).to(device, torch.float32)
+                # Кастуем латенты в float32 перед умножением, чтобы сохранить точность
+                latents_for_decode = (current_latents.to(torch.float32) * l_std) / sigma_data + l_mean
+            else:
+                latents_for_decode = current_latents.to(torch.float32)
+            # 2. Декодируем, ПРИНУДИТЕЛЬНО ВКЛЮЧИВ MATH_SDP только для этого шага!
+            with torch.backends.cuda.sdp_kernel(enable_math=True, enable_flash=False, enable_mem_efficient=False):
+                decoded = vae.decode(latents_for_decode).sample
+            # 3. Отсекаем лишнее видео-измерение
+            if decoded.ndim == 5:
+                decoded = decoded[:, :, 0, :, :]
+            # 4. Он уже во float32, можно сразу пускать в цикл
+            decoded_fp32 = decoded
+            for img_idx, img_tensor in enumerate(decoded_fp32):
+                img = (img_tensor / 2 + 0.5).clamp(0, 1).cpu().numpy()
+                img = img.transpose(1, 2, 0)
+                if np.isnan(img).any():
+                    print("NaNs found, saving stopped! Step:", step)
+                    img = np.nan_to_num(img, nan=0.0)
+                pil_img = Image.fromarray((img * 255).astype("uint8"))
+                max_w_overall = max(s[0] for s in fixed_samples_cpu.keys())
+                max_h_overall = max(s[1] for s in fixed_samples_cpu.keys())
+                max_w_overall = max(255, max_w_overall)
+                max_h_overall = max(255, max_h_overall)
+                padded_img = ImageOps.pad(pil_img, (max_w_overall, max_h_overall), color='white')
+                all_generated_images.append(padded_img)
+                caption_text = sample_text[img_idx][:300] if img_idx < len(sample_text) else ""
+                all_captions.append(caption_text)
+                sample_path = f"{generated_folder}/{project}_{width}x{height}_{img_idx}.jpg"
+                pil_img.save(sample_path, "JPEG", quality=95)
+        if use_wandb and accelerator.is_main_process:
+            wandb_images = [
+                wandb.Image(img, caption=f"{all_captions[i]}")
+                for i, img in enumerate(all_generated_images)
+            ]
+            wandb.log({"generated_images": wandb_images})
+        if use_comet_ml and accelerator.is_main_process:
+            for i, img in enumerate(all_generated_images):
+                comet_experiment.log_image(
+                    image_data=img,
+                    name=f"step_{step}_img_{i}",
+                    step=step,
+                    metadata={"caption": all_captions[i]}
+                )
+    finally:
+        vae.to("cpu")
+        uncond_emb = uncond_emb.to("cpu")
+        uncond_mask = uncond_mask.to("cpu")
+        try:
+            all_generated_images.clear()
+            all_captions.clear()
+            del all_generated_images, all_captions
+            del latents, current_latents, latent_model_input
+            del decoded, decoded_fp32
+            del sample_latents, sample_text_embeddings, sample_mask
+            del noise_pred, noise_pred_uncond
+        except UnboundLocalError:
+            pass
+        torch.cuda.synchronize()
+        torch.cuda.empty_cache()
+        gc.collect()
+# Render the fixed samples once before training (main process only, and only when save_model is set).
+if accelerator.is_main_process:
+    if save_model:
+        print("Генерация сэмплов до старта обучения...")
+        generate_and_save_samples(fixed_samples, (uncond_emb, uncond_mask), 0)
+# All processes wait here until the main process has finished sampling.
+accelerator.wait_for_everyone()
+def save_checkpoint(model_net, variant=""):
+    if accelerator.is_main_process:
+        model_to_save = None
+        if not torch_compile:
+            model_to_save = accelerator.unwrap_model(model_net)
+        else:
+            model_to_save = model_net
+        if variant != "":
+            model_to_save.to(dtype=torch.bfloat16).save_pretrained(
+                os.path.join(checkpoints_folder, f"{project}"), variant=variant
+            )
+        else:
+            model_to_save.save_pretrained(os.path.join(checkpoints_folder, f"{project}"))
+        torch.cuda.synchronize()
+        torch.cuda.empty_cache()
+        gc.collect()
+# Bookkeeping shared by the training loop below.
+if accelerator.is_main_process:
+    print(f"Total steps per GPU: {total_training_steps}")
+epoch_loss_points = []
+progress_bar = tqdm(total=total_training_steps, disable=not accelerator.is_local_main_process, desc="Training", unit="step")
+steps_per_epoch = len(dataloader)
+# Number of steps between barrier points, and the loss-averaging window used when deciding to save.
+sink_interval = max(1, steps_per_epoch // sink_interval_share)
+# Initial loss threshold: a checkpoint is saved only when the loss is below min_loss * save_barrier.
+min_loss = 4.
+# Sampling during training is time-based: every sample_interval_min minutes.
+last_sample_time = time.time()
+sample_interval_seconds = sample_interval_min * 60
+# Main training loop: one pass over the dataloader per epoch.
+for epoch in range(start_epoch, start_epoch + num_epochs):
+    # Per-epoch accumulators for the epoch-level averages logged at the end.
+    batch_losses = []
+    batch_grads = []
+    # NOTE(review): set_epoch presumably reseeds the sampler's shuffling per epoch - confirm.
+    batch_sampler.set_epoch(epoch)
+    accelerator.wait_for_everyone()
+    transformer.train()
+    for step, (latents, embeddings, attention_mask) in enumerate(dataloader):
+        # Dry-run aid: with save_model off, report peak CUDA memory once, at step 5 of epoch 0.
+        if save_model == False and epoch == 0 and step == 5 :
+            used_gb = torch.cuda.max_memory_allocated() / 1024**3
+            print(f"Шаг {step}: {used_gb:.2f} GB")
+        # Autocast is entered only when torch_compile is enabled.
+        amp_context = accelerator.autocast() if torch_compile else nullcontext()
+        with accelerator.accumulate(transformer):
+            with amp_context:
+                noise = torch.randn_like(latents, dtype=latents.dtype)
+                # Timesteps in (0, 1): sigmoid of a standard normal shifted by sigmoid_bias.
+                t = torch.sigmoid(torch.randn(latents.shape[0], device=latents.device, dtype=latents.dtype) + sigmoid_bias)
+                # Linear interpolation between data (t=0) and noise (t=1).
+                noisy_latents_5d = (1.0 - t.view(-1, 1, 1, 1, 1)) * latents + t.view(-1, 1, 1, 1, 1) * noise
+                # Regression target: the derivative of that interpolation with respect to t.
+                target_5d = noise - latents
+                # All-zero padding mask over the spatial dimensions.
+                padding_mask = torch.zeros((1, 1, latents.shape[3], latents.shape[4]), device=device, dtype=dtype)
+                timestep_tensor = t.flatten().to(dtype)
+                model_pred = transformer(
+                    hidden_states=noisy_latents_5d,
+                    timestep=timestep_tensor,
+                    encoder_hidden_states=embeddings,
+                    padding_mask=padding_mask,
+                    return_dict=False
+                )[0]
+                # The loss is computed in float32 regardless of the model dtype.
+                mse_loss = F.mse_loss(model_pred.float(), target_5d.float())
+                batch_losses.append(mse_loss.detach().item())
+                # Barrier every 100 global steps or every sink_interval steps; the same
+                # barrier is repeated before and after the backward pass.
+                if (global_step % 100 == 0) or (global_step % sink_interval == 0):
+                    accelerator.wait_for_everyone()
+                # NOTE(review): losses_dict is assigned here but not read anywhere in the visible code.
+                losses_dict = {}
+                losses_dict["mse"] = mse_loss
+                if (global_step % 100 == 0) or (global_step % sink_interval == 0):
+                    accelerator.wait_for_everyone()
+                accelerator.backward(mse_loss)
+                if (global_step % 100 == 0) or (global_step % sink_interval == 0):
+                    accelerator.wait_for_everyone()
+                grad = 0.0
+                # Without fbp, clip gradients and step the optimizer and scheduler once
+                # accumulation is complete.
+                # NOTE(review): with fbp set, no optimizer.step() happens here - presumably
+                # the update is performed elsewhere; confirm.
+                if not fbp:
+                    if accelerator.sync_gradients:
+                        grad_val = accelerator.clip_grad_norm_(transformer.parameters(), clip_grad_norm)
+                        grad = grad_val.float().item() if torch.is_tensor(grad_val) else float(grad_val)
+                        optimizer.step()
+                        lr_scheduler.step()
+                        optimizer.zero_grad(set_to_none=True)
+                # global_step advances once per synchronised (accumulated) step.
+                if accelerator.sync_gradients:
+                    global_step += 1
+                    progress_bar.update(1)
+                    if accelerator.is_main_process:
+                        if fbp:
+                            current_lr = base_learning_rate
+                        else:
+                            current_lr = lr_scheduler.get_last_lr()[0]
+                        batch_grads.append(grad)
+                        log_data = {}
+                        log_data["loss_mse"] = mse_loss.detach().item()
+                        log_data["lr"] = current_lr
+                        log_data["grad"] = grad
+                        # NOTE(review): this condition is always true here; the enclosing
+                        # block already checks sync_gradients.
+                        if accelerator.sync_gradients:
+                            if use_wandb:
+                                wandb.log(log_data, step=global_step)
+                            if use_comet_ml:
+                                comet_experiment.log_metrics(log_data, step=global_step)
+                        # Sample when the time interval has elapsed, and once at global step 50.
+                        current_time = time.time()
+                        is_time_to_sample = (current_time - last_sample_time) >= sample_interval_seconds
+                        if is_time_to_sample or global_step == 50:
+                            if save_model:
+                                generate_and_save_samples(fixed_samples, (uncond_emb, uncond_mask), global_step)
+                            elif epoch % 10 == 0:
+                                generate_and_save_samples(fixed_samples, (uncond_emb, uncond_mask), global_step)
+                            # NOTE(review): last_n is assigned but not read in the visible code.
+                            last_n = sink_interval
+                            if save_model:
+                                # Save only when both the recent average and the latest loss are
+                                # below the best value so far, scaled by save_barrier.
+                                has_losses = len(batch_losses) > 0
+                                avg_sample_loss = np.mean(batch_losses[-sink_interval:]) if has_losses else 0.0
+                                last_loss = batch_losses[-1] if has_losses else 0.0
+                                max_loss = max(avg_sample_loss, last_loss)
+                                should_save = max_loss < min_loss * save_barrier
+                                print(
+                                    f"Saving: {should_save} | Max: {max_loss:.4f} | "
+                                    f"Last: {last_loss:.4f} | Avg: {avg_sample_loss:.4f}"
+                                )
+                                if should_save:
+                                    min_loss = max_loss
+                                    save_checkpoint(transformer)
+                            last_sample_time = current_time
+                            # Sampling switches the model to eval mode; switch back to training.
+                            transformer.train()
+    # Epoch summary (main process only).
+    if accelerator.is_main_process:
+        avg_epoch_loss = np.mean(batch_losses) if len(batch_losses) > 0 else 0.0
+        avg_epoch_grad = np.mean(batch_grads) if len(batch_grads) > 0 else 0.0
+        print(f"\nЭпоха {epoch} завершена. Средний лосс: {avg_epoch_loss:.6f}")
+        log_data_ep = {
+                        "epoch_loss": avg_epoch_loss,
+                        "epoch_grad": avg_epoch_grad,
+                        "epoch": epoch + 1,
+                    }
+        if use_wandb:
+            wandb.log(log_data_ep)
+        if use_comet_ml:
+            comet_experiment.log_metrics(log_data_ep)
+# Final save and teardown.
+if accelerator.is_main_process:
+    print("Обучение завершено! Сохраняем финальную модель...")
+    # The non-empty "bf16" variant makes save_checkpoint cast the model to bfloat16 before saving.
+    save_checkpoint(transformer,"bf16")
+    if use_comet_ml:
+        comet_experiment.end()
+accelerator.free_memory()
+# Shut down the process group if distributed training was initialised.
+if torch.distributed.is_initialized():
+    torch.distributed.destroy_process_group()
+print("Готово!")

transformer/config.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "_class_name": "CosmosTransformer3DModel",
+  "_diffusers_version": "0.37.1",
+  "_name_or_path": "transformer",
+  "adaln_lora_dim": 256,
+  "attention_head_dim": 128,
+  "concat_padding_mask": true,
+  "controlnet_block_every_n": null,
+  "crossattn_proj_in_channels": 1024,
+  "encoder_hidden_states_channels": 1024,
+  "extra_pos_embed_type": null,
+  "img_context_dim_in": null,
+  "img_context_dim_out": 2048,
+  "img_context_num_tokens": 256,
+  "in_channels": 16,
+  "max_size": [
+    128,
+    240,
+    240
+  ],
+  "mlp_ratio": 4.0,
+  "num_attention_heads": 16,
+  "num_layers": 28,
+  "out_channels": 16,
+  "patch_size": [
+    1,
+    2,
+    2
+  ],
+  "rope_scale": [
+    1.0,
+    4.0,
+    4.0
+  ],
+  "text_embed_dim": 1024,
+  "use_crossattn_projection": false
+}

transformer/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:501d3b67f235189364d1bbeb3862fcdfc74e957f033e0714e8c2a12ba95a7041
+size 7825687184

vae/.ipynb_checkpoints/config-checkpoint.json ADDED Viewed

	@@ -0,0 +1,56 @@

+{
+  "_class_name": "AutoencoderKLQwenImage",
+  "_diffusers_version": "0.36.0.dev0",
+  "attn_scales": [],
+  "base_dim": 96,
+  "dim_mult": [
+    1,
+    2,
+    4,
+    4
+  ],
+  "dropout": 0.0,
+  "latents_mean": [
+    -0.7571,
+    -0.7089,
+    -0.9113,
+    0.1075,
+    -0.1745,
+    0.9653,
+    -0.1517,
+    1.5508,
+    0.4134,
+    -0.0715,
+    0.5517,
+    -0.3632,
+    -0.1922,
+    -0.9497,
+    0.2503,
+    -0.2921
+  ],
+  "latents_std": [
+    2.8184,
+    1.4541,
+    2.3275,
+    2.6558,
+    1.2196,
+    1.7708,
+    2.6052,
+    2.0743,
+    3.2687,
+    2.1526,
+    2.8652,
+    1.5579,
+    1.6382,
+    1.1253,
+    2.8251,
+    1.916
+  ],
+  "num_res_blocks": 2,
+  "temperal_downsample": [
+    false,
+    true,
+    true
+  ],
+  "z_dim": 16
+}

vae/config.json ADDED Viewed

	@@ -0,0 +1,56 @@

+{
+  "_class_name": "AutoencoderKLQwenImage",
+  "_diffusers_version": "0.36.0.dev0",
+  "attn_scales": [],
+  "base_dim": 96,
+  "dim_mult": [
+    1,
+    2,
+    4,
+    4
+  ],
+  "dropout": 0.0,
+  "latents_mean": [
+    -0.7571,
+    -0.7089,
+    -0.9113,
+    0.1075,
+    -0.1745,
+    0.9653,
+    -0.1517,
+    1.5508,
+    0.4134,
+    -0.0715,
+    0.5517,
+    -0.3632,
+    -0.1922,
+    -0.9497,
+    0.2503,
+    -0.2921
+  ],
+  "latents_std": [
+    2.8184,
+    1.4541,
+    2.3275,
+    2.6558,
+    1.2196,
+    1.7708,
+    2.6052,
+    2.0743,
+    3.2687,
+    2.1526,
+    2.8652,
+    1.5579,
+    1.6382,
+    1.1253,
+    2.8251,
+    1.916
+  ],
+  "num_res_blocks": 2,
+  "temperal_downsample": [
+    false,
+    true,
+    true
+  ],
+  "z_dim": 16
+}

vae/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0c8bc8b758c649abef9ea407b95408389a3b2f610d0d10fcb054fe171d0a8344
+size 253806966

wandb/debug-cli.root.log ADDED Viewed

File without changes

wandb/debug-internal.log ADDED Viewed

The diff for this file is too large to render. See raw diff

wandb/debug.log ADDED Viewed

	@@ -0,0 +1,19 @@

+2026-04-28 17:16:45,138 INFO    MainThread:14112 [wandb_setup.py:_flush():81] Current SDK version is 0.26.1
+2026-04-28 17:16:45,138 INFO    MainThread:14112 [wandb_setup.py:_flush():81] Configure stats pid to 14112
+2026-04-28 17:16:45,138 INFO    MainThread:14112 [wandb_setup.py:_flush():81] Loading settings from environment variables
+2026-04-28 17:16:45,138 INFO    MainThread:14112 [wandb_init.py:setup_run_log_directory():723] Logging user logs to /root/sdxs-2b/wandb/run-20260428_171645-wt40fdyx/logs/debug.log
+2026-04-28 17:16:45,138 INFO    MainThread:14112 [wandb_init.py:setup_run_log_directory():724] Logging internal logs to /root/sdxs-2b/wandb/run-20260428_171645-wt40fdyx/logs/debug-internal.log
+2026-04-28 17:16:45,138 INFO    MainThread:14112 [wandb_init.py:init():850] calling init triggers
+2026-04-28 17:16:45,138 INFO    MainThread:14112 [wandb_init.py:init():855] wandb.init called with sweep_config: {}
+config: {'batch_size': 16, 'base_learning_rate': 1.3333333333333335e-05, 'num_epochs': 1, 'optimizer_type': 'adafactor', '_wandb': {}}
+2026-04-28 17:16:45,138 INFO    MainThread:14112 [wandb_init.py:init():898] starting backend
+2026-04-28 17:16:45,343 INFO    MainThread:14112 [wandb_init.py:init():913] sending inform_init request
+2026-04-28 17:16:45,731 INFO    MainThread:14112 [wandb_init.py:init():918] backend started and connected
+2026-04-28 17:16:45,734 INFO    MainThread:14112 [wandb_init.py:init():988] updated telemetry
+2026-04-28 17:16:45,742 INFO    MainThread:14112 [wandb_init.py:init():1011] communicating run to backend with 90.0 second timeout
+2026-04-28 17:16:46,973 INFO    MainThread:14112 [wandb_init.py:init():1056] starting run threads in backend
+2026-04-28 17:16:47,099 INFO    MainThread:14112 [wandb_run.py:_console_start():2554] atexit reg
+2026-04-28 17:16:47,099 INFO    MainThread:14112 [wandb_run.py:_redirect():2403] redirect: wrap_raw
+2026-04-28 17:16:47,100 INFO    MainThread:14112 [wandb_run.py:_redirect():2472] Wrapping output streams.
+2026-04-28 17:16:47,100 INFO    MainThread:14112 [wandb_run.py:_redirect():2495] Redirects installed.
+2026-04-28 17:16:47,104 INFO    MainThread:14112 [wandb_init.py:init():1094] run started, returning control to user process

wandb/offline-run-20260428_132658-o9052r27/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,117 @@

+cuda-toolkit==13.0.2
+typing_extensions==4.15.0
+nvidia-nvjitlink==13.0.88
+MarkupSafe==3.0.3
+nvidia-cufile==1.15.1.6
+certifi==2026.4.22
+nvidia-cusolver==12.0.4.66
+nvidia-curand==10.4.0.35
+Jinja2==3.1.6
+nvidia-nvtx==13.0.85
+nvidia-cuda-cupti==13.0.85
+torchaudio==2.11.0+cu130
+safetensors==0.7.0
+nvidia-cuda-runtime==13.0.96
+torchvision==0.26.0+cu130
+nvidia-cufft==12.0.0.61
+nvidia-cusparse==12.6.3.3
+nvidia-cuda-nvrtc==13.0.88
+fsspec==2026.2.0
+nvidia-cusparselt-cu13==0.8.0
+nvidia-nccl-cu13==2.28.9
+nvidia-nvshmem-cu13==3.4.5
+nvidia-cublas==13.1.0.3
+nvidia-cudnn-cu13==9.19.0.56
+mpmath==1.3.0
+triton==3.6.0
+networkx==3.6.1
+sympy==1.14.0
+torch==2.11.0+cu130
+hf_transfer==0.1.9
+six==1.17.0
+typer==0.25.0
+typing-inspection==0.4.2
+muon-adamw8bit==0.5.0
+aiosignal==1.4.0
+wurlitzer==3.1.1
+semantic-version==2.10.0
+aiohappyeyeballs==2.6.1
+cycler==0.12.1
+tokenizers==0.22.2
+annotated-doc==0.0.4
+rpds-py==0.30.0
+configobj==5.0.9
+regex==2026.4.4
+zipp==3.23.1
+annotated-types==0.7.0
+everett==3.1.0
+pydantic_core==2.46.3
+mdurl==0.1.2
+platformdirs==4.9.6
+idna==3.13
+psutil==7.2.2
+xxhash==3.7.0
+smmap==5.0.3
+frozenlist==1.8.0
+multidict==6.7.1
+shellingham==1.5.4
+kiwisolver==1.5.0
+propcache==0.4.1
+h11==0.16.0
+hf-xet==1.4.3
+pyparsing==3.3.2
+yarl==1.23.0
+importlib_metadata==9.0.0
+referencing==0.37.0
+requests==2.33.1
+filelock==3.29.0
+charset-normalizer==3.4.7
+wrapt==2.1.2
+contourpy==1.3.3
+python-box==6.1.0
+python-dateutil==2.9.0.post0
+packaging==26.2
+httpx==0.28.1
+PyYAML==6.0.3
+click==8.3.3
+jsonschema-specifications==2025.9.1
+gitdb==4.0.12
+einops==0.8.2
+attrs==26.1.0
+httpcore==1.0.9
+cuda-pathfinder==1.5.4
+requests-toolbelt==1.0.0
+GitPython==3.1.48
+jsonschema==4.26.0
+tqdm==4.67.3
+urllib3==2.6.3
+anyio==4.13.0
+simplejson==4.1.1
+multiprocess==0.70.19
+dill==0.4.1
+protobuf==7.34.1
+markdown-it-py==4.0.0
+bitsandbytes==0.49.2
+cuda-bindings==13.2.0
+aiohttp==3.13.5
+accelerate==1.13.0
+dulwich==0.25.2
+pydantic==2.13.3
+datasets==4.8.5
+rich==15.0.0
+flash-linear-attention==0.5.0
+pillow==12.2.0
+huggingface_hub==1.12.0
+sentry-sdk==2.58.0
+fla-core==0.5.0
+Pygments==2.20.0
+diffusers==0.37.1
+fonttools==4.62.1
+comet_ml==3.57.3
+setuptools==81.0.0
+matplotlib==3.10.9
+pyarrow==24.0.0
+wandb==0.26.1
+numpy==2.4.4
+pandas==3.0.2
+transformers==5.6.2

wandb/offline-run-20260428_132658-o9052r27/logs/debug-core.log ADDED Viewed

	@@ -0,0 +1,14 @@

+{"time":"2026-04-28T13:26:58.701599632Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpll17c794/port-6681.txt","pid":6681,"detached":false,"idle-timeout":600000000000,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
+{"time":"2026-04-28T13:26:58.704326543Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":6681}
+{"time":"2026-04-28T13:26:58.70424692Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-6681-6712-3956627621/socket","Net":"unix"}}
+{"time":"2026-04-28T13:26:58.806869406Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
+{"time":"2026-04-28T13:26:58.828765063Z","level":"INFO","msg":"handleInformInit: received","streamId":"o9052r27","id":"1(@)"}
+{"time":"2026-04-28T13:26:58.960660655Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"o9052r27","id":"1(@)"}
+{"time":"2026-04-28T13:27:04.392467558Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
+{"time":"2026-04-28T13:27:04.392527721Z","level":"INFO","msg":"server is shutting down"}
+{"time":"2026-04-28T13:27:04.392535141Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
+{"time":"2026-04-28T13:27:04.392635535Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
+{"time":"2026-04-28T13:27:04.392627225Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-6681-6712-3956627621/socket","Net":"unix"}}
+{"time":"2026-04-28T13:27:04.421552415Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
+{"time":"2026-04-28T13:27:04.421573556Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
+{"time":"2026-04-28T13:27:04.421579966Z","level":"INFO","msg":"server is closed"}

wandb/offline-run-20260428_132658-o9052r27/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,15 @@

+{"time":"2026-04-28T13:26:58.829048314Z","level":"INFO","msg":"wandb-core"}
+{"time":"2026-04-28T13:26:58.829092766Z","level":"INFO","msg":"stream: starting","core version":"0.26.1"}
+{"time":"2026-04-28T13:26:58.960323061Z","level":"WARN","msg":"featurechecker: GraphQL client is nil, skipping feature loading"}
+{"time":"2026-04-28T13:26:58.960354402Z","level":"WARN","msg":"featurechecker: GraphQL client is nil, skipping feature loading"}
+{"time":"2026-04-28T13:26:58.960391424Z","level":"INFO","msg":"stream: created new stream","id":"o9052r27"}
+{"time":"2026-04-28T13:26:58.960480497Z","level":"INFO","msg":"handler: started"}
+{"time":"2026-04-28T13:26:58.960646764Z","level":"INFO","msg":"stream: started"}
+{"time":"2026-04-28T13:26:58.960704477Z","level":"INFO","msg":"writer: started","stream_id":"o9052r27"}
+{"time":"2026-04-28T13:26:58.960767929Z","level":"INFO","msg":"sender: started"}
+{"time":"2026-04-28T13:26:58.975123911Z","level":"WARN","msg":"featurechecker: GraphQL client is nil, skipping feature loading"}
+{"time":"2026-04-28T13:26:58.975175533Z","level":"WARN","msg":"runupserter: server does not expand metric globs but the x_server_side_expand_glob_metrics setting is set; ignoring"}
+{"time":"2026-04-28T13:27:04.392744599Z","level":"INFO","msg":"stream: finishing up"}
+{"time":"2026-04-28T13:27:04.39276658Z","level":"INFO","msg":"handler: closed"}
+{"time":"2026-04-28T13:27:04.392811252Z","level":"INFO","msg":"sender: closed"}
+{"time":"2026-04-28T13:27:04.392819012Z","level":"INFO","msg":"stream: all finished"}

wandb/offline-run-20260428_132658-o9052r27/logs/debug.log ADDED Viewed

	@@ -0,0 +1,21 @@

+2026-04-28 13:26:58,591 INFO    MainThread:6681 [wandb_setup.py:_flush():81] Current SDK version is 0.26.1
+2026-04-28 13:26:58,591 INFO    MainThread:6681 [wandb_setup.py:_flush():81] Configure stats pid to 6681
+2026-04-28 13:26:58,591 INFO    MainThread:6681 [wandb_setup.py:_flush():81] Loading settings from environment variables
+2026-04-28 13:26:58,591 INFO    MainThread:6681 [wandb_init.py:setup_run_log_directory():723] Logging user logs to /root/sdxs-2b/wandb/offline-run-20260428_132658-o9052r27/logs/debug.log
+2026-04-28 13:26:58,592 INFO    MainThread:6681 [wandb_init.py:setup_run_log_directory():724] Logging internal logs to /root/sdxs-2b/wandb/offline-run-20260428_132658-o9052r27/logs/debug-internal.log
+2026-04-28 13:26:58,592 INFO    MainThread:6681 [wandb_init.py:init():850] calling init triggers
+2026-04-28 13:26:58,592 INFO    MainThread:6681 [wandb_init.py:init():855] wandb.init called with sweep_config: {}
+config: {'batch_size': 7, 'base_learning_rate': 1.3333333333333335e-05, 'num_epochs': 1, 'optimizer_type': 'adafactor', '_wandb': {}}
+2026-04-28 13:26:58,592 INFO    MainThread:6681 [wandb_init.py:init():898] starting backend
+2026-04-28 13:26:58,807 INFO    MainThread:6681 [wandb_init.py:init():913] sending inform_init request
+2026-04-28 13:26:58,961 INFO    MainThread:6681 [wandb_init.py:init():918] backend started and connected
+2026-04-28 13:26:58,964 INFO    MainThread:6681 [wandb_init.py:init():988] updated telemetry
+2026-04-28 13:26:58,971 INFO    MainThread:6681 [wandb_init.py:init():1011] communicating run to backend with 90.0 second timeout
+2026-04-28 13:26:58,977 INFO    MainThread:6681 [wandb_init.py:init():1056] starting run threads in backend
+2026-04-28 13:26:59,098 INFO    MainThread:6681 [wandb_run.py:_console_start():2554] atexit reg
+2026-04-28 13:26:59,098 INFO    MainThread:6681 [wandb_run.py:_redirect():2403] redirect: wrap_raw
+2026-04-28 13:26:59,099 INFO    MainThread:6681 [wandb_run.py:_redirect():2472] Wrapping output streams.
+2026-04-28 13:26:59,099 INFO    MainThread:6681 [wandb_run.py:_redirect():2495] Redirects installed.
+2026-04-28 13:26:59,115 INFO    MainThread:6681 [wandb_init.py:init():1094] run started, returning control to user process
+2026-04-28 13:27:04,393 INFO    wandb-AsyncioManager-main:6681 [service_client.py:_forward_responses():134] Reached EOF.
+2026-04-28 13:27:04,393 INFO    wandb-AsyncioManager-main:6681 [mailbox.py:close():155] Closing mailbox, abandoning 0 handles.

wandb/offline-run-20260428_132658-o9052r27/run-o9052r27.wandb ADDED Viewed

Binary file (6.41 kB). View file

wandb/run-20260428_171645-wt40fdyx/files/output.log ADDED Viewed

	@@ -0,0 +1,385 @@

+The config attributes {'final_sigmas_type': 'sigma_min', 'sigma_data': 1.0, 'sigma_max': 80.0, 'sigma_min': 0.002} were passed to FlowMatchEulerDiscreteScheduler, but are not expected and will be ignored. Please verify your scheduler_config.json configuration file.
+[transformers] The fast path is not available because one of the required library is not installed. Falling back to torch implementation. To install follow https://github.com/fla-org/flash-linear-attention#installation and https://github.com/Dao-AILab/causal-conv1d
+Loading weights: 100%|████████████████████████████████████████████████████████████████████████████████████| 473/473 [00:00<00:00, 766.86it/s]
+images: 233407
+Total samples 14580
+Загружаем Transformer из чекпоинта: transformer
+--- РАЗМОРОЖЕННЫЕ СЛОИ ---
+--------------------------
+[ПОЧИНКА FIXED SAMPLES] Отсекаем мусор: torch.Size([1, 16, 16, 44, 80])
+[ОТЛАДКА ДАТАСЕТА] latents final shape: torch.Size([1, 16, 1, 44, 80])
+[ПОЧИНКА FIXED SAMPLES] Отсекаем мусор: torch.Size([1, 16, 16, 48, 80])
+[ОТЛАДКА ДАТАСЕТА] latents final shape: torch.Size([1, 16, 1, 48, 80])
+[ПОЧИНКА FIXED SAMPLES] Отсекаем мусор: torch.Size([1, 16, 16, 80, 40])
+[ОТЛАДКА ДАТАСЕТА] latents final shape: torch.Size([1, 16, 1, 80, 40])
+[ПОЧИНКА FIXED SAMPLES] Отсекаем мусор: torch.Size([1, 16, 16, 80, 44])
+[ОТЛАДКА ДАТАСЕТА] latents final shape: torch.Size([1, 16, 1, 80, 44])
+[ПОЧИНКА FIXED SAMPLES] Отсекаем мусор: torch.Size([1, 16, 16, 80, 48])
+[ОТЛАДКА ДАТАСЕТА] latents final shape: torch.Size([1, 16, 1, 80, 48])
+[ПОЧИНКА FIXED SAMPLES] Отсекаем мусор: torch.Size([1, 16, 16, 80, 52])
+[ОТЛАДКА ДАТАСЕТА] latents final shape: torch.Size([1, 16, 1, 80, 52])
+[ПОЧИНКА FIXED SAMPLES] Отсекаем мусор: torch.Size([1, 16, 16, 80, 56])
+[ОТЛАДКА ДАТАСЕТА] latents final shape: torch.Size([1, 16, 1, 80, 56])
+[ПОЧИНКА FIXED SAMPLES] Отсекаем мусор: torch.Size([1, 16, 16, 52, 80])
+[ОТЛАДКА ДАТАСЕТА] latents final shape: torch.Size([1, 16, 1, 52, 80])
+[ПОЧИНКА FIXED SAMPLES] Отсекаем мусор: torch.Size([1, 16, 16, 56, 80])
+[ОТЛАДКА ДАТАСЕТА] latents final shape: torch.Size([1, 16, 1, 56, 80])
+[ПОЧИНКА FIXED SAMPLES] Отсекаем мусор: torch.Size([1, 16, 16, 60, 80])
+[ОТЛАДКА ДАТАСЕТА] latents final shape: torch.Size([1, 16, 1, 60, 80])
+[ПОЧИНКА FIXED SAMPLES] Отсекаем мусор: torch.Size([1, 16, 16, 80, 60])
+[ОТЛАДКА ДАТАСЕТА] latents final shape: torch.Size([1, 16, 1, 80, 60])
+[ПОЧИНКА FIXED SAMPLES] Отсекаем мусор: torch.Size([1, 16, 16, 64, 80])
+[ОТЛАДКА ДАТАСЕТА] latents final shape: torch.Size([1, 16, 1, 64, 80])
+[ПОЧИНКА FIXED SAMPLES] Отсекаем мусор: torch.Size([1, 16, 16, 68, 80])
+[ОТЛАДКА ДАТАСЕТА] latents final shape: torch.Size([1, 16, 1, 68, 80])
+[ПОЧИНКА FIXED SAMPLES] Отсекаем мусор: torch.Size([1, 16, 16, 72, 80])
+[ОТЛАДКА ДАТАСЕТА] latents final shape: torch.Size([1, 16, 1, 72, 80])
+[ПОЧИНКА FIXED SAMPLES] Отсекаем мусор: torch.Size([1, 16, 16, 76, 80])
+[ОТЛАДКА ДАТАСЕТА] latents final shape: torch.Size([1, 16, 1, 76, 80])
+[ПОЧИНКА FIXED SAMPLES] Отсекаем мусор: torch.Size([1, 16, 16, 80, 64])
+[ОТЛАДКА ДАТАСЕТА] latents final shape: torch.Size([1, 16, 1, 80, 64])
+[ПОЧИНКА FIXED SAMPLES] Отсекаем мусор: torch.Size([1, 16, 16, 80, 68])
+[ОТЛАДКА ДАТАСЕТА] latents final shape: torch.Size([1, 16, 1, 80, 68])
+[ПОЧИНКА FIXED SAMPLES] Отсекаем мусор: torch.Size([1, 16, 16, 80, 72])
+[ОТЛАДКА ДАТАСЕТА] latents final shape: torch.Size([1, 16, 1, 80, 72])
+[ПОЧИНКА FIXED SAMPLES] Отсекаем мусор: torch.Size([1, 16, 16, 80, 76])
+[ОТЛАДКА ДАТАСЕТА] latents final shape: torch.Size([1, 16, 1, 80, 76])
+[ПОЧИНКА FIXED SAMPLES] Отсекаем мусор: torch.Size([1, 16, 16, 40, 80])
+[ОТЛАДКА ДАТАСЕТА] latents final shape: torch.Size([1, 16, 1, 40, 80])
+Создано 20 групп фиксированных семплов по разрешениям
+Генерация сэмплов до старта обучения...
+/usr/lib/python3.12/contextlib.py:105: FutureWarning: `torch.backends.cuda.sdp_kernel()` is deprecated. In the future, this context manager will be removed. Please see `torch.nn.attention.sdpa_kernel()` for the new context manager, with updated signature.
+  self.gen = func(*args, **kwds)
+==================================================
+[ОТЛАДКА VAE DECODE НА НУЛЕВОМ ШАГЕ]
+1. current_latents: shape=torch.Size([1, 16, 1, 44, 80])
+   min=-1.9811, max=2.2364, std=0.6226
+2. l_std shape=torch.Size([1, 16, 1, 1, 1]), l_mean shape=torch.Size([1, 16, 1, 1, 1])
+   sigma_data=1.0
+3. latents_for_decode: shape=torch.Size([1, 16, 1, 44, 80])
+   min=-3.9773, max=3.4072, std=1.1547
+4. decoded_fp32 (после VAE): shape=torch.Size([1, 3, 352, 640])
+   min=-1.0000, max=0.9945, std=0.6423
+==================================================
+==================================================
+[ОТЛАДКА VAE DECODE НА НУЛЕВОМ ШАГЕ]
+1. current_latents: shape=torch.Size([1, 16, 1, 48, 80])
+   min=-2.1993, max=1.9178, std=0.5500
+2. l_std shape=torch.Size([1, 16, 1, 1, 1]), l_mean shape=torch.Size([1, 16, 1, 1, 1])
+   sigma_data=1.0
+3. latents_for_decode: shape=torch.Size([1, 16, 1, 48, 80])
+   min=-3.5397, max=3.3824, std=1.0561
+4. decoded_fp32 (после VAE): shape=torch.Size([1, 3, 384, 640])
+   min=-1.0000, max=1.0000, std=0.3971
+==================================================
+==================================================
+[ОТЛАДКА VAE DECODE НА НУЛЕВОМ ШАГЕ]
+1. current_latents: shape=torch.Size([1, 16, 1, 80, 40])
+   min=-2.7174, max=2.0244, std=0.6368
+2. l_std shape=torch.Size([1, 16, 1, 1, 1]), l_mean shape=torch.Size([1, 16, 1, 1, 1])
+   sigma_data=1.0
+3. latents_for_decode: shape=torch.Size([1, 16, 1, 80, 40])
+   min=-4.0544, max=4.0678, std=1.1537
+4. decoded_fp32 (после VAE): shape=torch.Size([1, 3, 640, 320])
+   min=-0.9997, max=1.0000, std=0.5404
+==================================================
+==================================================
+[ОТЛАДКА VAE DECODE НА НУЛЕВОМ ШАГЕ]
+1. current_latents: shape=torch.Size([1, 16, 1, 80, 44])
+   min=-2.0394, max=2.0944, std=0.5736
+2. l_std shape=torch.Size([1, 16, 1, 1, 1]), l_mean shape=torch.Size([1, 16, 1, 1, 1])
+   sigma_data=1.0
+3. latents_for_decode: shape=torch.Size([1, 16, 1, 80, 44])
+   min=-3.8287, max=3.3714, std=1.0290
+4. decoded_fp32 (после VAE): shape=torch.Size([1, 3, 640, 352])
+   min=-1.0000, max=1.0000, std=0.4719
+==================================================
+==================================================
+[ОТЛАДКА VAE DECODE НА НУЛЕВОМ ШАГЕ]
+1. current_latents: shape=torch.Size([1, 16, 1, 80, 48])
+   min=-2.0441, max=1.9221, std=0.5108
+2. l_std shape=torch.Size([1, 16, 1, 1, 1]), l_mean shape=torch.Size([1, 16, 1, 1, 1])
+   sigma_data=1.0
+3. latents_for_decode: shape=torch.Size([1, 16, 1, 80, 48])
+   min=-3.4324, max=3.7347, std=0.9750
+4. decoded_fp32 (после VAE): shape=torch.Size([1, 3, 640, 384])
+   min=-1.0000, max=1.0000, std=0.5049
+==================================================
+==================================================
+[ОТЛАДКА VAE DECODE НА НУЛЕВОМ ШАГЕ]
+1. current_latents: shape=torch.Size([1, 16, 1, 80, 52])
+   min=-2.0292, max=2.2682, std=0.7043
+2. l_std shape=torch.Size([1, 16, 1, 1, 1]), l_mean shape=torch.Size([1, 16, 1, 1, 1])
+   sigma_data=1.0
+3. latents_for_decode: shape=torch.Size([1, 16, 1, 80, 52])
+   min=-4.1673, max=4.4971, std=1.3949
+4. decoded_fp32 (после VAE): shape=torch.Size([1, 3, 640, 416])
+   min=-1.0000, max=1.0000, std=0.6222
+==================================================
+==================================================
+[ОТЛАДКА VAE DECODE НА НУЛЕВОМ ШАГЕ]
+1. current_latents: shape=torch.Size([1, 16, 1, 80, 56])
+   min=-1.7528, max=1.6711, std=0.6432
+2. l_std shape=torch.Size([1, 16, 1, 1, 1]), l_mean shape=torch.Size([1, 16, 1, 1, 1])
+   sigma_data=1.0
+3. latents_for_decode: shape=torch.Size([1, 16, 1, 80, 56])
+   min=-4.0104, max=4.1834, std=1.4406
+4. decoded_fp32 (после VAE): shape=torch.Size([1, 3, 640, 448])
+   min=-0.9654, max=1.0000, std=0.4818
+==================================================
+==================================================
+[ОТЛАДКА VAE DECODE НА НУЛЕВОМ ШАГЕ]
+1. current_latents: shape=torch.Size([1, 16, 1, 52, 80])
+   min=-2.0965, max=2.2269, std=0.4286
+2. l_std shape=torch.Size([1, 16, 1, 1, 1]), l_mean shape=torch.Size([1, 16, 1, 1, 1])
+   sigma_data=1.0
+3. latents_for_decode: shape=torch.Size([1, 16, 1, 52, 80])
+   min=-3.3608, max=2.9338, std=0.9200
+4. decoded_fp32 (после VAE): shape=torch.Size([1, 3, 416, 640])
+   min=-1.0000, max=0.9774, std=0.3019
+==================================================
+==================================================
+[ОТЛАДКА VAE DECODE НА НУЛЕВОМ ШАГЕ]
+1. current_latents: shape=torch.Size([1, 16, 1, 56, 80])
+   min=-2.3215, max=2.6622, std=0.6174
+2. l_std shape=torch.Size([1, 16, 1, 1, 1]), l_mean shape=torch.Size([1, 16, 1, 1, 1])
+   sigma_data=1.0
+3. latents_for_decode: shape=torch.Size([1, 16, 1, 56, 80])
+   min=-3.6939, max=4.7696, std=1.3130
+4. decoded_fp32 (после VAE): shape=torch.Size([1, 3, 448, 640])
+   min=-1.0000, max=1.0000, std=0.4811
+==================================================
+==================================================
+[ОТЛАДКА VAE DECODE НА НУЛЕВОМ ШАГЕ]
+1. current_latents: shape=torch.Size([1, 16, 1, 60, 80])
+   min=-2.2899, max=2.1393, std=0.5506
+2. l_std shape=torch.Size([1, 16, 1, 1, 1]), l_mean shape=torch.Size([1, 16, 1, 1, 1])
+   sigma_data=1.0
+3. latents_for_decode: shape=torch.Size([1, 16, 1, 60, 80])
+   min=-4.0351, max=4.0100, std=1.1577
+4. decoded_fp32 (после VAE): shape=torch.Size([1, 3, 480, 640])
+   min=-1.0000, max=1.0000, std=0.6317
+==================================================
+==================================================
+[ОТЛАДКА VAE DECODE НА НУЛЕВОМ ШАГЕ]
+1. current_latents: shape=torch.Size([1, 16, 1, 80, 60])
+   min=-1.8058, max=2.0032, std=0.5188
+2. l_std shape=torch.Size([1, 16, 1, 1, 1]), l_mean shape=torch.Size([1, 16, 1, 1, 1])
+   sigma_data=1.0
+3. latents_for_decode: shape=torch.Size([1, 16, 1, 80, 60])
+   min=-3.2342, max=3.6659, std=1.0352
+4. decoded_fp32 (после VAE): shape=torch.Size([1, 3, 640, 480])
+   min=-1.0000, max=1.0000, std=0.6372
+==================================================
+==================================================
+[ОТЛАДКА VAE DECODE НА НУЛЕВОМ ШАГЕ]
+1. current_latents: shape=torch.Size([1, 16, 1, 64, 80])
+   min=-2.1774, max=2.1568, std=0.6666
+2. l_std shape=torch.Size([1, 16, 1, 1, 1]), l_mean shape=torch.Size([1, 16, 1, 1, 1])
+   sigma_data=1.0
+3. latents_for_decode: shape=torch.Size([1, 16, 1, 64, 80])
+   min=-4.7810, max=5.1935, std=1.3580
+4. decoded_fp32 (после VAE): shape=torch.Size([1, 3, 512, 640])
+   min=-1.0000, max=1.0000, std=0.5784
+==================================================
+==================================================
+[ОТЛАДКА VAE DECODE НА НУЛЕВОМ ШАГЕ]
+1. current_latents: shape=torch.Size([1, 16, 1, 68, 80])
+   min=-1.9091, max=2.1057, std=0.5661
+2. l_std shape=torch.Size([1, 16, 1, 1, 1]), l_mean shape=torch.Size([1, 16, 1, 1, 1])
+   sigma_data=1.0
+3. latents_for_decode: shape=torch.Size([1, 16, 1, 68, 80])
+   min=-3.4599, max=3.7540, std=1.0538
+4. decoded_fp32 (после VAE): shape=torch.Size([1, 3, 544, 640])
+   min=-1.0000, max=1.0000, std=0.6665
+==================================================
+==================================================
+[ОТЛАДКА VAE DECODE НА НУЛЕВОМ ШАГЕ]
+1. current_latents: shape=torch.Size([1, 16, 1, 72, 80])
+   min=-2.1917, max=2.3725, std=0.6957
+2. l_std shape=torch.Size([1, 16, 1, 1, 1]), l_mean shape=torch.Size([1, 16, 1, 1, 1])
+   sigma_data=1.0
+3. latents_for_decode: shape=torch.Size([1, 16, 1, 72, 80])
+   min=-3.8205, max=4.1090, std=1.5053
+4. decoded_fp32 (после VAE): shape=torch.Size([1, 3, 576, 640])
+   min=-1.0000, max=1.0000, std=0.6376
+==================================================
+==================================================
+[ОТЛАДКА VAE DECODE НА НУЛЕВОМ ШАГЕ]
+1. current_latents: shape=torch.Size([1, 16, 1, 76, 80])
+   min=-2.3168, max=2.0439, std=0.6811
+2. l_std shape=torch.Size([1, 16, 1, 1, 1]), l_mean shape=torch.Size([1, 16, 1, 1, 1])
+   sigma_data=1.0
+3. latents_for_decode: shape=torch.Size([1, 16, 1, 76, 80])
+   min=-3.8838, max=4.5797, std=1.3369
+4. decoded_fp32 (после VAE): shape=torch.Size([1, 3, 608, 640])
+   min=-1.0000, max=1.0000, std=0.6667
+==================================================
+==================================================
+[ОТЛАДКА VAE DECODE НА НУЛЕВОМ ШАГЕ]
+1. current_latents: shape=torch.Size([1, 16, 1, 80, 64])
+   min=-2.2767, max=2.3007, std=0.5141
+2. l_std shape=torch.Size([1, 16, 1, 1, 1]), l_mean shape=torch.Size([1, 16, 1, 1, 1])
+   sigma_data=1.0
+3. latents_for_decode: shape=torch.Size([1, 16, 1, 80, 64])
+   min=-3.7021, max=3.3769, std=0.8752
+4. decoded_fp32 (после VAE): shape=torch.Size([1, 3, 640, 512])
+   min=-1.0000, max=1.0000, std=0.4680
+==================================================
+==================================================
+[ОТЛАДКА VAE DECODE НА НУЛЕВОМ ШАГЕ]
+1. current_latents: shape=torch.Size([1, 16, 1, 80, 68])
+   min=-2.3068, max=2.3424, std=0.7115
+2. l_std shape=torch.Size([1, 16, 1, 1, 1]), l_mean shape=torch.Size([1, 16, 1, 1, 1])
+   sigma_data=1.0
+3. latents_for_decode: shape=torch.Size([1, 16, 1, 80, 68])
+   min=-3.9636, max=4.6402, std=1.4684
+4. decoded_fp32 (после VAE): shape=torch.Size([1, 3, 640, 544])
+   min=-0.9553, max=1.0000, std=0.4083
+==================================================
+==================================================
+[ОТЛАДКА VAE DECODE НА НУЛЕВОМ ШАГЕ]
+1. current_latents: shape=torch.Size([1, 16, 1, 80, 72])
+   min=-2.3526, max=2.5922, std=0.7641
+2. l_std shape=torch.Size([1, 16, 1, 1, 1]), l_mean shape=torch.Size([1, 16, 1, 1, 1])
+   sigma_data=1.0
+3. latents_for_decode: shape=torch.Size([1, 16, 1, 80, 72])
+   min=-4.1452, max=4.7889, std=1.6258
+4. decoded_fp32 (после VAE): shape=torch.Size([1, 3, 640, 576])
+   min=-1.0000, max=1.0000, std=0.7539
+==================================================
+==================================================
+[ОТЛАДКА VAE DECODE НА НУЛЕВОМ ШАГЕ]
+1. current_latents: shape=torch.Size([1, 16, 1, 80, 76])
+   min=-1.7528, max=1.9838, std=0.4715
+2. l_std shape=torch.Size([1, 16, 1, 1, 1]), l_mean shape=torch.Size([1, 16, 1, 1, 1])
+   sigma_data=1.0
+3. latents_for_decode: shape=torch.Size([1, 16, 1, 80, 76])
+   min=-3.3567, max=3.4733, std=1.0891
+4. decoded_fp32 (после VAE): shape=torch.Size([1, 3, 640, 608])
+   min=-0.9685, max=0.9913, std=0.4626
+==================================================
+==================================================
+[ОТЛАДКА VAE DECODE НА НУЛЕВОМ ШАГЕ]
+1. current_latents: shape=torch.Size([1, 16, 1, 40, 80])
+   min=-2.1861, max=2.0078, std=0.5870
+2. l_std shape=torch.Size([1, 16, 1, 1, 1]), l_mean shape=torch.Size([1, 16, 1, 1, 1])
+   sigma_data=1.0
+3. latents_for_decode: shape=torch.Size([1, 16, 1, 40, 80])
+   min=-4.1700, max=4.9017, std=1.3442
+4. decoded_fp32 (после VAE): shape=torch.Size([1, 3, 320, 640])
+   min=-0.8055, max=0.9961, std=0.3967
+==================================================
+Total steps per GPU: 14580
+Training:  36%|████████████████████████████▉                                                    | 5205/14580 [15:41:32<27:32:51, 10.58s/step]
+Saving: True | Max: 0.1232 | Last: 0.1094 | Avg: 0.1232
+Saving: True | Max: 0.1423 | Last: 0.1423 | Avg: 0.1245
+Saving: True | Max: 0.1248 | Last: 0.0964 | Avg: 0.1248
+Saving: True | Max: 0.1423 | Last: 0.1423 | Avg: 0.1246
+Saving: True | Max: 0.1417 | Last: 0.1417 | Avg: 0.1236
+Saving: True | Max: 0.1405 | Last: 0.1405 | Avg: 0.1234
+Saving: True | Max: 0.1534 | Last: 0.1534 | Avg: 0.1232
+Saving: True | Max: 0.1559 | Last: 0.1559 | Avg: 0.1234
+Saving: True | Max: 0.1231 | Last: 0.1104 | Avg: 0.1231
+Saving: True | Max: 0.1229 | Last: 0.1139 | Avg: 0.1229
+Saving: True | Max: 0.1228 | Last: 0.1152 | Avg: 0.1228
+Saving: True | Max: 0.1485 | Last: 0.1485 | Avg: 0.1228
+Saving: True | Max: 0.1231 | Last: 0.0736 | Avg: 0.1231
+Saving: True | Max: 0.1232 | Last: 0.1016 | Avg: 0.1232
+Saving: True | Max: 0.1519 | Last: 0.1519 | Avg: 0.1234
+Saving: True | Max: 0.1233 | Last: 0.1096 | Avg: 0.1233
+Saving: True | Max: 0.1232 | Last: 0.1051 | Avg: 0.1232
+Saving: True | Max: 0.1234 | Last: 0.1173 | Avg: 0.1234
+Saving: True | Max: 0.1233 | Last: 0.1168 | Avg: 0.1233
+Saving: True | Max: 0.1309 | Last: 0.1309 | Avg: 0.1229
+Saving: True | Max: 0.1432 | Last: 0.1432 | Avg: 0.1227
+Saving: True | Max: 0.1226 | Last: 0.1211 | Avg: 0.1226
+Saving: True | Max: 0.1227 | Last: 0.1227 | Avg: 0.1221
+Saving: True | Max: 0.1219 | Last: 0.1029 | Avg: 0.1219
+Saving: True | Max: 0.1217 | Last: 0.1058 | Avg: 0.1217
+Saving: True | Max: 0.1218 | Last: 0.1206 | Avg: 0.1218
+Saving: True | Max: 0.1379 | Last: 0.1379 | Avg: 0.1221
+Saving: True | Max: 0.1228 | Last: 0.1012 | Avg: 0.1228
+Saving: True | Max: 0.1226 | Last: 0.1121 | Avg: 0.1226
+Saving: True | Max: 0.1226 | Last: 0.0930 | Avg: 0.1226
+Saving: False | Max: 0.1564 | Last: 0.1564 | Avg: 0.1230
+Saving: True | Max: 0.1266 | Last: 0.1266 | Avg: 0.1234
+Saving: True | Max: 0.1234 | Last: 0.1050 | Avg: 0.1234
+Saving: True | Max: 0.1235 | Last: 0.1031 | Avg: 0.1235
+Saving: True | Max: 0.1235 | Last: 0.0956 | Avg: 0.1235
+Saving: True | Max: 0.1233 | Last: 0.1117 | Avg: 0.1233
+Saving: False | Max: 0.1559 | Last: 0.1559 | Avg: 0.1229
+Saving: True | Max: 0.1532 | Last: 0.1532 | Avg: 0.1234
+Saving: True | Max: 0.1248 | Last: 0.1248 | Avg: 0.1231
+Saving: True | Max: 0.1445 | Last: 0.1445 | Avg: 0.1228
+Saving: True | Max: 0.1514 | Last: 0.1514 | Avg: 0.1229
+Saving: True | Max: 0.1225 | Last: 0.1021 | Avg: 0.1225
+Saving: True | Max: 0.1317 | Last: 0.1317 | Avg: 0.1221
+Saving: True | Max: 0.1220 | Last: 0.1002 | Avg: 0.1220
+Saving: True | Max: 0.1321 | Last: 0.1321 | Avg: 0.1221
+Saving: True | Max: 0.1260 | Last: 0.1260 | Avg: 0.1218
+Saving: True | Max: 0.1212 | Last: 0.1191 | Avg: 0.1212
+Saving: True | Max: 0.1213 | Last: 0.1155 | Avg: 0.1213
+Saving: True | Max: 0.1212 | Last: 0.1138 | Avg: 0.1212
+Saving: True | Max: 0.1213 | Last: 0.1184 | Avg: 0.1213
+Saving: True | Max: 0.1371 | Last: 0.1371 | Avg: 0.1215

wandb/run-20260428_171645-wt40fdyx/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,117 @@

+cuda-toolkit==13.0.2
+typing_extensions==4.15.0
+nvidia-nvjitlink==13.0.88
+MarkupSafe==3.0.3
+nvidia-cufile==1.15.1.6
+certifi==2026.4.22
+nvidia-cusolver==12.0.4.66
+nvidia-curand==10.4.0.35
+Jinja2==3.1.6
+nvidia-nvtx==13.0.85
+nvidia-cuda-cupti==13.0.85
+torchaudio==2.11.0+cu130
+safetensors==0.7.0
+nvidia-cuda-runtime==13.0.96
+torchvision==0.26.0+cu130
+nvidia-cufft==12.0.0.61
+nvidia-cusparse==12.6.3.3
+nvidia-cuda-nvrtc==13.0.88
+fsspec==2026.2.0
+nvidia-cusparselt-cu13==0.8.0
+nvidia-nccl-cu13==2.28.9
+nvidia-nvshmem-cu13==3.4.5
+nvidia-cublas==13.1.0.3
+nvidia-cudnn-cu13==9.19.0.56
+mpmath==1.3.0
+triton==3.6.0
+networkx==3.6.1
+sympy==1.14.0
+torch==2.11.0+cu130
+hf_transfer==0.1.9
+six==1.17.0
+typer==0.25.0
+typing-inspection==0.4.2
+muon-adamw8bit==0.5.0
+aiosignal==1.4.0
+wurlitzer==3.1.1
+semantic-version==2.10.0
+aiohappyeyeballs==2.6.1
+cycler==0.12.1
+tokenizers==0.22.2
+annotated-doc==0.0.4
+rpds-py==0.30.0
+configobj==5.0.9
+regex==2026.4.4
+zipp==3.23.1
+annotated-types==0.7.0
+everett==3.1.0
+pydantic_core==2.46.3
+mdurl==0.1.2
+platformdirs==4.9.6
+idna==3.13
+psutil==7.2.2
+xxhash==3.7.0
+smmap==5.0.3
+frozenlist==1.8.0
+multidict==6.7.1
+shellingham==1.5.4
+kiwisolver==1.5.0
+propcache==0.4.1
+h11==0.16.0
+hf-xet==1.4.3
+pyparsing==3.3.2
+yarl==1.23.0
+importlib_metadata==9.0.0
+referencing==0.37.0
+requests==2.33.1
+filelock==3.29.0
+charset-normalizer==3.4.7
+wrapt==2.1.2
+contourpy==1.3.3
+python-box==6.1.0
+python-dateutil==2.9.0.post0
+packaging==26.2
+httpx==0.28.1
+PyYAML==6.0.3
+click==8.3.3
+jsonschema-specifications==2025.9.1
+gitdb==4.0.12
+einops==0.8.2
+attrs==26.1.0
+httpcore==1.0.9
+cuda-pathfinder==1.5.4
+requests-toolbelt==1.0.0
+GitPython==3.1.48
+jsonschema==4.26.0
+tqdm==4.67.3
+urllib3==2.6.3
+anyio==4.13.0
+simplejson==4.1.1
+multiprocess==0.70.19
+dill==0.4.1
+protobuf==7.34.1
+markdown-it-py==4.0.0
+bitsandbytes==0.49.2
+cuda-bindings==13.2.0
+aiohttp==3.13.5
+accelerate==1.13.0
+dulwich==0.25.2
+pydantic==2.13.3
+datasets==4.8.5
+rich==15.0.0
+flash-linear-attention==0.5.0
+pillow==12.2.0
+huggingface_hub==1.12.0
+sentry-sdk==2.58.0
+fla-core==0.5.0
+Pygments==2.20.0
+diffusers==0.37.1
+fonttools==4.62.1
+comet_ml==3.57.3
+setuptools==81.0.0
+matplotlib==3.10.9
+pyarrow==24.0.0
+wandb==0.26.1
+numpy==2.4.4
+pandas==3.0.2
+transformers==5.6.2

wandb/run-20260428_171645-wt40fdyx/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "os":  "Linux-6.8.0-110-generic-x86_64-with-glibc2.39",
+  "python":  "CPython 3.12.3",
+  "startedAt":  "2026-04-28T17:16:45.135482Z",
+  "args":  [
+    "--batch",
+    "16",
+    "--lvl",
+    "1"
+  ],
+  "program":  "/root/sdxs-2b/train.py",
+  "codePath":  "train.py",
+  "codePathLocal":  "train.py",
+  "git":  {
+    "remote":  "https://huggingface.co/AiArtLab/sdxs-2b",
+    "commit":  "ab8719f79299a6e86448b407298689048767b261"
+  },
+  "email":  "vadim-kulibaba@yandex.ru",
+  "root":  "/root/sdxs-2b",
+  "host":  "O-1649582",
+  "executable":  "/root/.venv/bin/python3",
+  "cpu_count":  48,
+  "cpu_count_logical":  96,
+  "gpu":  "NVIDIA GeForce RTX 5090",
+  "gpu_count":  1,
+  "disk":  {
+    "/":  {
+      "total":  "888178696192",
+      "used":  "598432870400"
+    }
+  },
+  "memory":  {
+    "total":  "134889213952"
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA GeForce RTX 5090",
+      "memoryTotal":  "34190917632",
+      "cudaCores":  21760,
+      "architecture":  "Blackwell",
+      "uuid":  "GPU-af06c899-cefd-2303-137f-17f69c648771"
+    }
+  ],
+  "cudaVersion":  "13.0",
+  "writerId":  "9ndk10qtzdsvighcagxlxbtug93n98at"
+}

wandb/run-20260428_171645-wt40fdyx/logs/debug-core.log ADDED Viewed

	@@ -0,0 +1,7 @@

+{"time":"2026-04-28T17:16:45.179418861Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpvlugdolt/port-14112.txt","pid":14112,"detached":false,"idle-timeout":600000000000,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
+{"time":"2026-04-28T17:16:45.18135139Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":14112}
+{"time":"2026-04-28T17:16:45.181241106Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-14112-14129-3488405791/socket","Net":"unix"}}
+{"time":"2026-04-28T17:16:45.343101118Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
+{"time":"2026-04-28T17:16:45.350678398Z","level":"INFO","msg":"handleInformInit: received","streamId":"wt40fdyx","id":"1(@)"}
+{"time":"2026-04-28T17:16:45.730308466Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"wt40fdyx","id":"1(@)"}
+{"time":"2026-04-28T17:16:54.001250093Z","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"lrv3btfsqddl"}

wandb/run-20260428_171645-wt40fdyx/logs/debug-internal.log ADDED Viewed

The diff for this file is too large to render. See raw diff

wandb/run-20260428_171645-wt40fdyx/logs/debug.log ADDED Viewed

	@@ -0,0 +1,19 @@

+2026-04-28 17:16:45,138 INFO    MainThread:14112 [wandb_setup.py:_flush():81] Current SDK version is 0.26.1
+2026-04-28 17:16:45,138 INFO    MainThread:14112 [wandb_setup.py:_flush():81] Configure stats pid to 14112
+2026-04-28 17:16:45,138 INFO    MainThread:14112 [wandb_setup.py:_flush():81] Loading settings from environment variables
+2026-04-28 17:16:45,138 INFO    MainThread:14112 [wandb_init.py:setup_run_log_directory():723] Logging user logs to /root/sdxs-2b/wandb/run-20260428_171645-wt40fdyx/logs/debug.log
+2026-04-28 17:16:45,138 INFO    MainThread:14112 [wandb_init.py:setup_run_log_directory():724] Logging internal logs to /root/sdxs-2b/wandb/run-20260428_171645-wt40fdyx/logs/debug-internal.log
+2026-04-28 17:16:45,138 INFO    MainThread:14112 [wandb_init.py:init():850] calling init triggers
+2026-04-28 17:16:45,138 INFO    MainThread:14112 [wandb_init.py:init():855] wandb.init called with sweep_config: {}
+config: {'batch_size': 16, 'base_learning_rate': 1.3333333333333335e-05, 'num_epochs': 1, 'optimizer_type': 'adafactor', '_wandb': {}}
+2026-04-28 17:16:45,138 INFO    MainThread:14112 [wandb_init.py:init():898] starting backend
+2026-04-28 17:16:45,343 INFO    MainThread:14112 [wandb_init.py:init():913] sending inform_init request
+2026-04-28 17:16:45,731 INFO    MainThread:14112 [wandb_init.py:init():918] backend started and connected
+2026-04-28 17:16:45,734 INFO    MainThread:14112 [wandb_init.py:init():988] updated telemetry
+2026-04-28 17:16:45,742 INFO    MainThread:14112 [wandb_init.py:init():1011] communicating run to backend with 90.0 second timeout
+2026-04-28 17:16:46,973 INFO    MainThread:14112 [wandb_init.py:init():1056] starting run threads in backend
+2026-04-28 17:16:47,099 INFO    MainThread:14112 [wandb_run.py:_console_start():2554] atexit reg
+2026-04-28 17:16:47,099 INFO    MainThread:14112 [wandb_run.py:_redirect():2403] redirect: wrap_raw
+2026-04-28 17:16:47,100 INFO    MainThread:14112 [wandb_run.py:_redirect():2472] Wrapping output streams.
+2026-04-28 17:16:47,100 INFO    MainThread:14112 [wandb_run.py:_redirect():2495] Redirects installed.
+2026-04-28 17:16:47,104 INFO    MainThread:14112 [wandb_init.py:init():1094] run started, returning control to user process

wandb/run-20260428_171645-wt40fdyx/run-wt40fdyx.wandb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:84f6382ddf7402e5b98379478d7897d5a678f939eba1a8a5d028a988674120a5
+size 15499264