init

Browse files

Files changed (16) hide show

.gitattributes +5 -7
.gitignore +0 -8
dataset-sdxs2b-1152.py +300 -0
dataset-sdxs2b-640.py +285 -0
dataset_sample.ipynb +3 -170
pipeline_sdxs.py +106 -207
refined.jpg +3 -0
scheduler/.ipynb_checkpoints/scheduler_config-checkpoint.json +0 -22
scheduler/scheduler_config.json +3 -22
test.ipynb +2 -2
train-sdxs2b.py +33 -10
transformer/diffusion_pytorch_model.safetensors +2 -2
wandb/debug-internal.log +0 -0
wandb/debug-internal.log +1 -0
wandb/debug.log +0 -19
wandb/debug.log +1 -0

.gitattributes CHANGED Viewed

@@ -33,10 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
-media/refined.jpg filter=lfs diff=lfs merge=lfs -text
-test.ipynb filter=lfs diff=lfs merge=lfs -text
-wandb/run-20260428_171645-wt40fdyx/run-wt40fdyx.wandb filter=lfs diff=lfs merge=lfs -text
-wandb/run-20260502_205213-nj3nqkga/run-nj3nqkga.wandb filter=lfs diff=lfs merge=lfs -text
-wandb/run-20260504_065935-dzvbyo3j/run-dzvbyo3j.wandb filter=lfs diff=lfs merge=lfs -text
-wandb/run-20260505_075313-ti70f47q/run-ti70f47q.wandb filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.ipynb filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text
+media/refined.webp filter=lfs diff=lfs merge=lfs -text

.gitignore CHANGED Viewed

@@ -11,11 +11,3 @@ datasets
 test
 wandb
 nohup.out
-samples/
-transformer/
-*.jpg
-*.png
-datasets/
-samples/
-*.jpg
-train.py

 test
 wandb
 nohup.out

dataset-sdxs2b-1152.py ADDED Viewed

	@@ -0,0 +1,300 @@

+# pip install flash-attn --no-build-isolation
+import torch
+import os
+import gc
+import numpy as np
+import random
+import json
+import shutil
+import time
+from datasets import Dataset, load_from_disk, concatenate_datasets
+from diffusers import AutoencoderKLQwenImage
+from torchvision.transforms import Resize, ToTensor, Normalize, Compose, InterpolationMode, Lambda
+from transformers import AutoModel, AutoImageProcessor, AutoTokenizer, AutoModelForCausalLM
+from typing import Dict, List, Tuple, Optional, Any
+from PIL import Image
+from tqdm import tqdm
+from datetime import timedelta
+from accelerate import Accelerator
+# One Accelerator per process; its device, rank and world size are reused below.
+accelerator = Accelerator()
+device = accelerator.device
+is_main_process = accelerator.is_main_process
+process_index = accelerator.process_index
+num_processes = accelerator.num_processes
+# ---------------- 1️⃣ Settings ----------------
+dtype = torch.float16  # dtype for the VAE weights and the image batches
+batch_size = 5  # images per VAE encode call
+min_size = 576  # lower bound for each side of the crop
+max_size = 1152  # upper bound for each side of the crop
+step = 64  # crop sides are floored to a multiple of this
+empty_share = 0.0  # probability forwarded to process_labels_for_guidance
+limit = 0  # forwarded to process_folder
+folder_path = "/root/dataset"  # root directory scanned for image/caption pairs
+save_path = "/root/ds1234_1152_vae_qwen"  # destination of the combined dataset
+os.makedirs(save_path, exist_ok=True)
+def clear_cuda_memory():
+    """Log this process's peak CUDA allocation and release cached GPU memory.
+    Does nothing when CUDA is not available.
+    """
+    if not torch.cuda.is_available():
+        return
+    # max_memory_allocated() reports the peak allocation, not the current usage.
+    used_gb = torch.cuda.max_memory_allocated() / 1024**3
+    print(f"[GPU {process_index}] used_gb: {used_gb:.2f} GB")
+    # Collect Python garbage first: tensors still held by unreachable objects
+    # must be freed before empty_cache() can hand their blocks back.
+    gc.collect()
+    torch.cuda.empty_cache()
+# ---------------- 2️⃣ Model loading ----------------
+def load_models():
+    """Load the Qwen-Image VAE from the local "vae" directory in eval mode on this process's device."""
+    print(f"[GPU {process_index}] Загрузка моделей...")
+    model = AutoencoderKLQwenImage.from_pretrained("vae", torch_dtype=dtype)
+    model = model.to(device)
+    return model.eval()
+vae = load_models()
+# Fall back to an identity shift/scale when the config lacks the field or stores a falsy value.
+shift_factor = getattr(vae.config, "shift_factor", 0.0) or 0.0
+scaling_factor = getattr(vae.config, "scaling_factor", 1.0) or 1.0
+mean = getattr(vae.config, "latents_mean", None)
+std = getattr(vae.config, "latents_std", None)
+# Per-channel statistics reshaped to (1, C, 1, 1, 1) so they broadcast over 5-D latents.
+# latents_mean / latents_std exist only when both config values are present.
+if mean is not None and std is not None:
+    latents_std = torch.tensor(std, device=device, dtype=dtype).view(1, len(std), 1, 1, 1)
+    latents_mean = torch.tensor(mean, device=device, dtype=dtype).view(1, len(mean), 1, 1, 1)
+# ---------------- 3️⃣ Transforms ----------------
+def get_image_transform(min_size=256, max_size=512, step=64):
+    """Build a resize-and-crop transform used for size bucketing.
+    The returned ``transform(img, dry_run=False)`` scales the image so its longer
+    side equals ``max_size``; if that leaves a side below ``min_size`` it scales
+    so the shorter side equals ``min_size`` instead. Crop sides are floored to a
+    multiple of ``step`` and clamped to ``[min_size, max_size]``.
+    With ``dry_run=True`` only ``(crop_width, crop_height)`` is returned; otherwise
+    the result is ``(tensor, cropped_image, width, height)`` where the tensor is
+    normalized with mean 0.5 and std 0.5 per channel.
+    """
+    def snap(side):
+        # Floor to the step grid, then clamp into [min_size, max_size].
+        return max(min_size, min(max_size, (side // step) * step))
+    def transform(img, dry_run=False):
+        src_w, src_h = img.size
+        # First attempt: the longer side becomes max_size.
+        if src_w >= src_h:
+            new_width, new_height = max_size, int(max_size * src_h / src_w)
+        else:
+            new_width, new_height = int(max_size * src_w / src_h), max_size
+        # One side ended up too small: rescale so the shorter side is min_size.
+        if min(new_width, new_height) < min_size:
+            if src_w <= src_h:
+                new_width, new_height = min_size, int(min_size * src_h / src_w)
+            else:
+                new_width, new_height = int(min_size * src_w / src_h), min_size
+        crop_width, crop_height = snap(new_width), snap(new_height)
+        if dry_run:
+            return crop_width, crop_height
+        resized = img.convert("RGB").resize((new_width, new_height), Image.LANCZOS)
+        # The crop starts at the left edge and one third of the spare height from the top.
+        top = (new_height - crop_height) // 3
+        img_cropped = resized.crop((0, top, crop_width, top + crop_height))
+        final_width, final_height = img_cropped.size
+        to_normalized = Compose([ToTensor(), Normalize(mean=[0.5]*3, std=[0.5]*3)])
+        return to_normalized(img_cropped), img_cropped, final_width, final_height
+    return transform
+# ---------------- 4️⃣ Processing functions ----------------
+def clean_label(label):
+    """Remove captioner boilerplate ("Image N", "The image depicts ", ...) from a caption."""
+    boilerplate = (
+        "Image 1", "Image 2", "Image 3", "Image 4",
+        "The image depicts ", "The image presents ",
+        "The image features ", "The image portrays ", "The image is ",
+    )
+    for fragment in boilerplate:
+        label = label.replace(fragment, "")
+    label = label.strip()
+    # Drop a single leading period that may remain after the removals.
+    return label[1:].lstrip() if label.startswith(".") else label
+def process_labels_for_guidance(original_labels, prob_to_make_empty=0.01):
+    """Randomly blank captions with probability ``prob_to_make_empty``.
+    Returns ``(labels_for_model, labels_for_logging)``: a blanked caption becomes
+    ``""`` in the first list and ``"zero: <caption>"`` in the second; all other
+    captions appear unchanged in both.
+    """
+    labels_for_model, labels_for_logging = [], []
+    for caption in original_labels:
+        dropped = random.random() < prob_to_make_empty
+        labels_for_model.append("" if dropped else caption)
+        labels_for_logging.append(f"zero: {caption}" if dropped else caption)
+    return labels_for_model, labels_for_logging
+def encode_to_latents(images, texts):
+    """Encode a batch of PIL images into normalized VAE latents.
+    Args:
+        images: PIL images of one size bucket.
+        texts: captions, one per image, in the same order.
+    Returns:
+        A dict with columns "vae", "text", "width" and "height", or ``None``
+        when no image in the batch could be transformed.
+    """
+    transform = get_image_transform(min_size, max_size, step)
+    transformed_tensors = []
+    kept_texts = []
+    widths, heights = [], []
+    for img, text in zip(images, texts):
+        try:
+            t_img, _, w, h = transform(img)
+        except Exception as e:
+            print(f"Ошибка трансформации: {e}")
+            continue
+        transformed_tensors.append(t_img)
+        # Keep the caption only for images that survived, so row i of every
+        # returned column describes the same sample.
+        kept_texts.append(text)
+        widths.append(w)
+        heights.append(h)
+    if not transformed_tensors:
+        return None
+    batch_tensor = torch.stack(transformed_tensors).to(device, dtype)
+    # A 4-D image batch gets a singleton axis at dim 2 before being passed to the VAE.
+    if batch_tensor.ndim == 4:
+        batch_tensor = batch_tensor.unsqueeze(2)
+    with torch.no_grad():
+        posteriors = vae.encode(batch_tensor).latent_dist.mode()
+        if mean is not None and std is not None:
+            posteriors = (posteriors - latents_mean) / latents_std
+        posteriors = (posteriors - shift_factor) / scaling_factor
+    # Remove the singleton axis again before storing.
+    latents_np = posteriors.squeeze(2).cpu().numpy()
+    text_labels = [clean_label(text) for text in kept_texts]
+    # NOTE(review): the stored captions are the *logging* variant (blanked ones
+    # carry a "zero: " prefix), not the model variant — confirm this is intended.
+    _, text_labels = process_labels_for_guidance(text_labels, empty_share)
+    return {
+        "vae": latents_np,
+        "text": text_labels,
+        "width": widths,
+        "height": heights
+    }
+# ---------------- 5️⃣ Folder processing ----------------
+def process_folder(folder_path, limit=None):
+    """Collect image/caption pairs under ``folder_path`` with their bucket sizes.
+    An image is kept only if a ``.txt`` file with the same stem sits next to it
+    and the image can be opened.
+    Args:
+        folder_path: directory walked recursively.
+        limit: maximum number of pairs to collect; ``None`` or ``0`` means no limit.
+    Returns:
+        ``(image_paths, text_paths, width, height)`` as four parallel lists, where
+        width/height are the crop sizes computed by the dry-run transform.
+    """
+    image_paths, text_paths, width, height = [], [], [], []
+    transform = get_image_transform(min_size, max_size, step)
+    for root, _, files in os.walk(folder_path):
+        if limit and len(image_paths) >= limit:
+            break
+        for filename in files:
+            if limit and len(image_paths) >= limit:
+                break
+            if not filename.lower().endswith((".jpg", ".jpeg", ".png", ".webp")):
+                continue
+            image_path = os.path.join(root, filename)
+            text_path = os.path.splitext(image_path)[0] + ".txt"
+            # Images without a caption are never used, so skip them before opening.
+            if not os.path.exists(text_path):
+                continue
+            try:
+                # The context manager closes the file handle as soon as the size is known.
+                with Image.open(image_path) as img:
+                    w, h = transform(img, dry_run=True)
+            except Exception as e:
+                # Best effort: an unreadable image is skipped, but not silently.
+                print(f"Skipping {image_path}: {e}")
+                continue
+            image_paths.append(image_path)
+            text_paths.append(text_path)
+            width.append(w)
+            height.append(h)
+    print(f"Найдено {len(image_paths)} изображений")
+    return image_paths, text_paths, width, height
+def process_in_chunks(image_paths, text_paths, width, height, chunk_size=5000, batch_size=1):
+    """Encode the files chunk by chunk and save one dataset per (chunk, size) group.
+    Args:
+        image_paths, text_paths, width, height: parallel lists describing the samples.
+        chunk_size: number of samples handled per chunk.
+        batch_size: batch size passed to ``Dataset.map`` (images per encode call).
+    Each group is saved under ``{save_path}_temp`` and CUDA caches are cleared
+    after every group.
+    """
+    total_files = len(image_paths)
+    for chunk_idx, start in enumerate(range(0, total_files, chunk_size), 1):
+        end = min(start + chunk_size, total_files)
+        # Group the chunk's samples by target size so every batch stacks cleanly.
+        size_groups = {}
+        chunk_rows = zip(image_paths[start:end], text_paths[start:end], width[start:end], height[start:end])
+        for image_path, text_path, w, h in chunk_rows:
+            try:
+                with open(text_path, 'r', encoding='utf-8') as f:
+                    caption = f.read().strip()
+            except (OSError, UnicodeDecodeError) as e:
+                # Best effort: an unreadable caption becomes empty, but is reported.
+                print(f"Could not read caption {text_path}: {e}")
+                caption = ""
+            group = size_groups.setdefault((w, h), {"image_paths": [], "texts": []})
+            group["image_paths"].append(image_path)
+            group["texts"].append(caption)
+        for size_key, group_data in size_groups.items():
+            group_dataset = Dataset.from_dict(group_data)
+            processed_group = group_dataset.map(
+                lambda ex: encode_to_latents(
+                    [Image.open(p) for p in ex["image_paths"]],
+                    ex["texts"]
+                ),
+                batched=True,
+                batch_size=batch_size,
+            )
+            # Chunk index, size and process index together keep the path unique.
+            group_save_path = f"{save_path}_temp/chunk_{chunk_idx}_{size_key[0]}x{size_key[1]}_proc_{process_index}_"
+            processed_group.save_to_disk(group_save_path)
+            clear_cuda_memory()
+# ---------------- 7️⃣ Merging ----------------
+def combine_chunks(temp_path, final_path):
+    """Load every "chunk_" dataset found in ``temp_path``, concatenate them in sorted path order and save to ``final_path``."""
+    chunk_dirs = []
+    for entry in os.listdir(temp_path):
+        if "chunk_" in entry:
+            chunk_dirs.append(os.path.join(temp_path, entry))
+    chunk_dirs.sort()
+    combined = concatenate_datasets([load_from_disk(path) for path in chunk_dirs])
+    combined.save_to_disk(final_path)
+    print("✅ Сохранено")
+# ---------------- MAIN ----------------
+# Directory that receives the per-process chunk datasets before they are merged.
+temp_path = f"{save_path}_temp"
+os.makedirs(temp_path, exist_ok=True)
+image_paths, text_paths, width, height = process_folder(folder_path,limit)
+# Sort by (width, height)
+sorted_indices = sorted(range(len(width)), key=lambda i:(width[i],height[i]))
+image_paths = [image_paths[i] for i in sorted_indices]
+text_paths = [text_paths[i] for i in sorted_indices]
+width = [width[i] for i in sorted_indices]
+height = [height[i] for i in sorted_indices]
+# --- shard across GPUs ---
+# Each process takes every num_processes-th sample, starting at its own index.
+indices = list(range(len(image_paths)))
+indices = indices[process_index::num_processes]
+image_paths = [image_paths[i] for i in indices]
+text_paths = [text_paths[i] for i in indices]
+width = [width[i] for i in indices]
+height = [height[i] for i in indices]
+print(f"[GPU {process_index}] обрабатывает {len(image_paths)} файлов")
+process_in_chunks(image_paths, text_paths, width, height, chunk_size=1000, batch_size=batch_size)
+# Synchronize all processes before the chunks are merged.
+accelerator.wait_for_everyone()
+# --- NEW: main process only ---
+if is_main_process:
+    # NOTE(review): this deletes the input folder (folder_path) before the chunks
+    # are merged, and any failure is silently ignored — confirm this is intended.
+    try:
+        shutil.rmtree(folder_path)
+    except:
+        pass
+    combine_chunks(temp_path, save_path)
+    # Best-effort removal of the temporary chunk directory after the merge.
+    try:
+        shutil.rmtree(temp_path)
+    except:
+        pass

dataset-sdxs2b-640.py ADDED Viewed

	@@ -0,0 +1,285 @@

+# pip install flash-attn --no-build-isolation
+import torch
+import os
+import gc
+import numpy as np
+import random
+import json
+import shutil
+import time
+from datasets import Dataset, load_from_disk, concatenate_datasets
+from diffusers import AutoencoderKLQwenImage
+from torchvision.transforms import Resize, ToTensor, Normalize, Compose, InterpolationMode, Lambda
+from transformers import AutoModel, AutoImageProcessor, AutoTokenizer, AutoModelForCausalLM
+from typing import Dict, List, Tuple, Optional, Any
+from PIL import Image
+from tqdm import tqdm
+from datetime import timedelta
+from accelerate import Accelerator
+# One Accelerator per process; its device, rank and world size are reused below.
+accelerator = Accelerator()
+device = accelerator.device
+is_main_process = accelerator.is_main_process
+process_index = accelerator.process_index
+num_processes = accelerator.num_processes
+# ---------------- 1️⃣ Settings ----------------
+dtype = torch.float16  # dtype for the VAE weights and the image batches
+batch_size = 5  # images per VAE encode call
+min_size = 320  # lower bound for each side of the crop
+max_size = 640  # upper bound for each side of the crop
+step = 64  # crop sides are floored to a multiple of this
+empty_share = 0.0  # probability forwarded to process_labels_for_guidance
+limit = 0  # forwarded to process_folder
+folder_path = "/root/datasets/butterfly"  # root directory scanned for image/caption pairs
+save_path = "datasets/dsb_640_vae_qwen"  # base path; chunk datasets go to "<save_path>_temp"
+os.makedirs(save_path, exist_ok=True)
+def clear_cuda_memory():
+    """Log this process's peak CUDA allocation and release cached GPU memory.
+    Does nothing when CUDA is not available.
+    """
+    if not torch.cuda.is_available():
+        return
+    # max_memory_allocated() reports the peak allocation, not the current usage.
+    used_gb = torch.cuda.max_memory_allocated() / 1024**3
+    print(f"[GPU {process_index}] used_gb: {used_gb:.2f} GB")
+    # Collect Python garbage first: tensors still held by unreachable objects
+    # must be freed before empty_cache() can hand their blocks back.
+    gc.collect()
+    torch.cuda.empty_cache()
+# ---------------- 2️⃣ Model loading ----------------
+def load_models():
+    """Load the Qwen-Image VAE from the local "vae" directory in eval mode on this process's device."""
+    print(f"[GPU {process_index}] Загрузка моделей...")
+    model = AutoencoderKLQwenImage.from_pretrained("vae", torch_dtype=dtype)
+    model = model.to(device)
+    return model.eval()
+vae = load_models()
+# Fall back to an identity shift/scale when the config lacks the field or stores a falsy value.
+shift_factor = getattr(vae.config, "shift_factor", 0.0) or 0.0
+scaling_factor = getattr(vae.config, "scaling_factor", 1.0) or 1.0
+mean = getattr(vae.config, "latents_mean", None)
+std = getattr(vae.config, "latents_std", None)
+# Per-channel statistics reshaped to (1, C, 1, 1, 1) so they broadcast over 5-D latents.
+# latents_mean / latents_std exist only when both config values are present.
+if mean is not None and std is not None:
+    latents_std = torch.tensor(std, device=device, dtype=dtype).view(1, len(std), 1, 1, 1)
+    latents_mean = torch.tensor(mean, device=device, dtype=dtype).view(1, len(mean), 1, 1, 1)
+# ---------------- 3️⃣ Transforms ----------------
+def get_image_transform(min_size=256, max_size=512, step=64):
+    """Build a resize-and-crop transform used for size bucketing.
+    The returned ``transform(img, dry_run=False)`` scales the image so its longer
+    side equals ``max_size``; if that leaves a side below ``min_size`` it scales
+    so the shorter side equals ``min_size`` instead. Crop sides are floored to a
+    multiple of ``step`` and clamped to ``[min_size, max_size]``.
+    With ``dry_run=True`` only ``(crop_width, crop_height)`` is returned; otherwise
+    the result is ``(tensor, cropped_image, width, height)`` where the tensor is
+    normalized with mean 0.5 and std 0.5 per channel.
+    """
+    def snap(side):
+        # Floor to the step grid, then clamp into [min_size, max_size].
+        return max(min_size, min(max_size, (side // step) * step))
+    def transform(img, dry_run=False):
+        src_w, src_h = img.size
+        # First attempt: the longer side becomes max_size.
+        if src_w >= src_h:
+            new_width, new_height = max_size, int(max_size * src_h / src_w)
+        else:
+            new_width, new_height = int(max_size * src_w / src_h), max_size
+        # One side ended up too small: rescale so the shorter side is min_size.
+        if min(new_width, new_height) < min_size:
+            if src_w <= src_h:
+                new_width, new_height = min_size, int(min_size * src_h / src_w)
+            else:
+                new_width, new_height = int(min_size * src_w / src_h), min_size
+        crop_width, crop_height = snap(new_width), snap(new_height)
+        if dry_run:
+            return crop_width, crop_height
+        resized = img.convert("RGB").resize((new_width, new_height), Image.LANCZOS)
+        # The crop starts at the left edge and one third of the spare height from the top.
+        top = (new_height - crop_height) // 3
+        img_cropped = resized.crop((0, top, crop_width, top + crop_height))
+        final_width, final_height = img_cropped.size
+        to_normalized = Compose([ToTensor(), Normalize(mean=[0.5]*3, std=[0.5]*3)])
+        return to_normalized(img_cropped), img_cropped, final_width, final_height
+    return transform
+# ---------------- 4️⃣ Processing functions ----------------
+def clean_label(label):
+    """Remove captioner boilerplate ("Image N", "The image depicts ", ...) from a caption."""
+    boilerplate = (
+        "Image 1", "Image 2", "Image 3", "Image 4",
+        "The image depicts ", "The image presents ",
+        "The image features ", "The image portrays ", "The image is ",
+    )
+    for fragment in boilerplate:
+        label = label.replace(fragment, "")
+    label = label.strip()
+    # Drop a single leading period that may remain after the removals.
+    return label[1:].lstrip() if label.startswith(".") else label
+def process_labels_for_guidance(original_labels, prob_to_make_empty=0.01):
+    """Randomly blank captions with probability ``prob_to_make_empty``.
+    Returns ``(labels_for_model, labels_for_logging)``: a blanked caption becomes
+    ``""`` in the first list and ``"zero: <caption>"`` in the second; all other
+    captions appear unchanged in both.
+    """
+    labels_for_model, labels_for_logging = [], []
+    for caption in original_labels:
+        dropped = random.random() < prob_to_make_empty
+        labels_for_model.append("" if dropped else caption)
+        labels_for_logging.append(f"zero: {caption}" if dropped else caption)
+    return labels_for_model, labels_for_logging
+def encode_to_latents(images, texts):
+    """Encode a batch of PIL images into normalized VAE latents.
+    Args:
+        images: PIL images of one size bucket.
+        texts: captions, one per image, in the same order.
+    Returns:
+        A dict with columns "vae", "text", "width" and "height", or ``None``
+        when no image in the batch could be transformed.
+    """
+    transform = get_image_transform(min_size, max_size, step)
+    transformed_tensors = []
+    kept_texts = []
+    widths, heights = [], []
+    for img, text in zip(images, texts):
+        try:
+            t_img, _, w, h = transform(img)
+        except Exception as e:
+            print(f"Ошибка трансформации: {e}")
+            continue
+        transformed_tensors.append(t_img)
+        # Keep the caption only for images that survived, so row i of every
+        # returned column describes the same sample.
+        kept_texts.append(text)
+        widths.append(w)
+        heights.append(h)
+    if not transformed_tensors:
+        return None
+    batch_tensor = torch.stack(transformed_tensors).to(device, dtype)
+    # A 4-D image batch gets a singleton axis at dim 2 before being passed to the VAE.
+    if batch_tensor.ndim == 4:
+        batch_tensor = batch_tensor.unsqueeze(2)
+    with torch.no_grad():
+        posteriors = vae.encode(batch_tensor).latent_dist.mode()
+        if mean is not None and std is not None:
+            posteriors = (posteriors - latents_mean) / latents_std
+        posteriors = (posteriors - shift_factor) / scaling_factor
+    # Remove the singleton axis again before storing.
+    latents_np = posteriors.squeeze(2).cpu().numpy()
+    text_labels = [clean_label(text) for text in kept_texts]
+    # NOTE(review): the stored captions are the *logging* variant (blanked ones
+    # carry a "zero: " prefix), not the model variant — confirm this is intended.
+    _, text_labels = process_labels_for_guidance(text_labels, empty_share)
+    return {
+        "vae": latents_np,
+        "text": text_labels,
+        "width": widths,
+        "height": heights
+    }
+# ---------------- 5️⃣ Folder processing ----------------
+def process_folder(folder_path, limit=None):
+    """Collect image/caption pairs under ``folder_path`` with their bucket sizes.
+    An image is kept only if a ``.txt`` file with the same stem sits next to it
+    and the image can be opened.
+    Args:
+        folder_path: directory walked recursively.
+        limit: maximum number of pairs to collect; ``None`` or ``0`` means no limit.
+    Returns:
+        ``(image_paths, text_paths, width, height)`` as four parallel lists, where
+        width/height are the crop sizes computed by the dry-run transform.
+    """
+    image_paths, text_paths, width, height = [], [], [], []
+    transform = get_image_transform(min_size, max_size, step)
+    for root, _, files in os.walk(folder_path):
+        if limit and len(image_paths) >= limit:
+            break
+        for filename in files:
+            if limit and len(image_paths) >= limit:
+                break
+            if not filename.lower().endswith((".jpg", ".jpeg", ".png", ".webp")):
+                continue
+            image_path = os.path.join(root, filename)
+            text_path = os.path.splitext(image_path)[0] + ".txt"
+            # Images without a caption are never used, so skip them before opening.
+            if not os.path.exists(text_path):
+                continue
+            try:
+                # The context manager closes the file handle as soon as the size is known.
+                with Image.open(image_path) as img:
+                    w, h = transform(img, dry_run=True)
+            except Exception as e:
+                # Best effort: an unreadable image is skipped, but not silently.
+                print(f"Skipping {image_path}: {e}")
+                continue
+            image_paths.append(image_path)
+            text_paths.append(text_path)
+            width.append(w)
+            height.append(h)
+    print(f"Найдено {len(image_paths)} изображений")
+    return image_paths, text_paths, width, height
+def process_in_chunks(image_paths, text_paths, width, height, chunk_size=5000, batch_size=1):
+    """Encode the files chunk by chunk and save one dataset per (chunk, size) group.
+    Args:
+        image_paths, text_paths, width, height: parallel lists describing the samples.
+        chunk_size: number of samples handled per chunk.
+        batch_size: batch size passed to ``Dataset.map`` (images per encode call).
+    Each group is saved under ``{save_path}_temp`` and CUDA caches are cleared
+    after every group.
+    """
+    total_files = len(image_paths)
+    for chunk_idx, start in enumerate(range(0, total_files, chunk_size), 1):
+        end = min(start + chunk_size, total_files)
+        # Group the chunk's samples by target size so every batch stacks cleanly.
+        size_groups = {}
+        chunk_rows = zip(image_paths[start:end], text_paths[start:end], width[start:end], height[start:end])
+        for image_path, text_path, w, h in chunk_rows:
+            try:
+                with open(text_path, 'r', encoding='utf-8') as f:
+                    caption = f.read().strip()
+            except (OSError, UnicodeDecodeError) as e:
+                # Best effort: an unreadable caption becomes empty, but is reported.
+                print(f"Could not read caption {text_path}: {e}")
+                caption = ""
+            group = size_groups.setdefault((w, h), {"image_paths": [], "texts": []})
+            group["image_paths"].append(image_path)
+            group["texts"].append(caption)
+        for size_key, group_data in size_groups.items():
+            group_dataset = Dataset.from_dict(group_data)
+            processed_group = group_dataset.map(
+                lambda ex: encode_to_latents(
+                    [Image.open(p) for p in ex["image_paths"]],
+                    ex["texts"]
+                ),
+                batched=True,
+                batch_size=batch_size,
+            )
+            # Chunk index, size and process index together keep the path unique.
+            group_save_path = f"{save_path}_temp/chunk_{chunk_idx}_{size_key[0]}x{size_key[1]}_proc_{process_index}_"
+            processed_group.save_to_disk(group_save_path)
+            clear_cuda_memory()
+# ---------------- 7️⃣ Merging ----------------
+def combine_chunks(temp_path, final_path):
+    """Load every "chunk_" dataset found in ``temp_path``, concatenate them in sorted path order and save to ``final_path``."""
+    chunk_dirs = []
+    for entry in os.listdir(temp_path):
+        if "chunk_" in entry:
+            chunk_dirs.append(os.path.join(temp_path, entry))
+    chunk_dirs.sort()
+    combined = concatenate_datasets([load_from_disk(path) for path in chunk_dirs])
+    combined.save_to_disk(final_path)
+    print("✅ Сохранено")
+# ---------------- MAIN ----------------
+# Directory that receives the per-process chunk datasets.
+temp_path = f"{save_path}_temp"
+os.makedirs(temp_path, exist_ok=True)
+image_paths, text_paths, width, height = process_folder(folder_path,limit)
+# Sort by (width, height)
+sorted_indices = sorted(range(len(width)), key=lambda i:(width[i],height[i]))
+image_paths = [image_paths[i] for i in sorted_indices]
+text_paths = [text_paths[i] for i in sorted_indices]
+width = [width[i] for i in sorted_indices]
+height = [height[i] for i in sorted_indices]
+# --- shard across GPUs ---
+# Each process takes every num_processes-th sample, starting at its own index.
+indices = list(range(len(image_paths)))
+indices = indices[process_index::num_processes]
+image_paths = [image_paths[i] for i in indices]
+text_paths = [text_paths[i] for i in indices]
+width = [width[i] for i in indices]
+height = [height[i] for i in indices]
+print(f"[GPU {process_index}] обрабатывает {len(image_paths)} файлов")
+# NOTE(review): as shown, the script ends after encoding; the chunk datasets stay
+# under temp_path and combine_chunks is never called here — confirm intended.
+process_in_chunks(image_paths, text_paths, width, height, chunk_size=1000, batch_size=batch_size)

dataset_sample.ipynb CHANGED Viewed

@@ -1,170 +1,3 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "9c312df2-cb57-44f6-af54-3af6ab8f962f",
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "ModuleNotFoundError",
-     "evalue": "No module named 'numpy'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[3], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m#from datasets import load_from_disk\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[1;32m      3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\n\u001b[1;32m      4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mPIL\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Image\n",
-      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'numpy'"
-     ]
-    }
-   ],
-   "source": [
-    "from datasets import load_from_disk\n",
-    "import numpy as np\n",
-    "import torch\n",
-    "from PIL import Image\n",
-    "from collections import defaultdict\n",
-    "from diffusers import AutoencoderKLQwenImage\n",
-    "import gc\n",
-    "\n",
-    "def analyze_dataset_by_size(dataset_path):\n",
-    "    \"\"\"\n",
-    "    Группирует датасет по размерам изображений и выводит базовую информацию.\n",
-    "    \"\"\"\n",
-    "    # Настройка устройства и типа данных\n",
-    "    dtype = torch.float16\n",
-    "    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
-    "    \n",
-    "    # Загрузка VAE модели\n",
-    "    print(\"Загрузка VAE модели...\")\n",
-    "    vae = AutoencoderKLQwenImage.from_pretrained(\"vae\",torch_dtype=dtype).to(device).eval()\n",
-    "    shift_factor = getattr(vae.config, \"shift_factor\", 0.0)\n",
-    "    if shift_factor is None:\n",
-    "        shift_factor = 0.0\n",
-    "    \n",
-    "    scaling_factor = getattr(vae.config, \"scaling_factor\", 1.0)\n",
-    "    if scaling_factor is None:\n",
-    "        scaling_factor = 1.0\n",
-    "        \n",
-    "    mean = getattr(vae.config, \"latents_mean\", None)\n",
-    "    std = getattr(vae.config, \"latents_std\", None)\n",
-    "    if mean is not None and std is not None:\n",
-    "        latents_std = torch.tensor(std, device=device, dtype=dtype).view(1, len(std), 1, 1)\n",
-    "        latents_mean = torch.tensor(mean, device=device, dtype=dtype).view(1, len(mean), 1, 1)\n",
-    "    \n",
-    "    # Загружаем датасет\n",
-    "    print(f\"Загрузка датасета из {dataset_path}...\")\n",
-    "    dataset = load_from_disk(dataset_path)\n",
-    "\n",
-    "    print(f\"Осталось примеров после фильтрации: {len(dataset)}\")\n",
-    "    \n",
-    "    # Группируем примеры по размерам\n",
-    "    print(\"\\nГруппировка примеров по размерам...\")\n",
-    "    size_to_indices = defaultdict(list)\n",
-    "    \n",
-    "    # Собираем примеры с одинаковыми размерами\n",
-    "    # Собираем примеры с одинаковыми размерами (оптимизированная версия)\n",
-    "    widths = dataset[\"width\"]\n",
-    "    heights = dataset[\"height\"]\n",
-    "    for i, (w, h) in enumerate(zip(widths, heights)):\n",
-    "        size_to_indices[(w, h)].append(i)\n",
-    "    \n",
-    "    # Сортируем размеры по количеству примеров\n",
-    "    print(\"\\nСортируем...\")\n",
-    "    size_stats = [(size, len(indices)) for size, indices in size_to_indices.items()]\n",
-    "    size_stats.sort(key=lambda x: x[1], reverse=True)\n",
-    "    \n",
-    "    # Выводим информацию о каждой группе и показываем первый пример\n",
-    "    for size, count in size_stats:\n",
-    "        width, height = size\n",
-    "        first_idx = size_to_indices[size][1]\n",
-    "        example = dataset[first_idx]\n",
-    "        \n",
-    "        print(f\"\\n--- Батч {width}x{height}: {count} примеров ---\")\n",
-    "        \n",
-    "        # Декодируем латентное представление для первого примера\n",
-    "        latent = torch.tensor(example[\"vae\"], dtype=dtype).unsqueeze(0).to(device)\n",
-    "        \n",
-    "        # 1. Снова обманываем VAE, превращая картинку в \"видео из 1 кадра\" [B, C, 1, H, W]\n",
-    "        if latent.ndim == 4:\n",
-    "            latent = latent.unsqueeze(2)\n",
-    "        \n",
-    "        with torch.no_grad():\n",
-    "            if latents_mean is not None and latents_std is not None:\n",
-    "                latent = latent * latents_std + latents_mean\n",
-    "            \n",
-    "            print(f\"Min of latent_for_vae: {latent.min()}\")\n",
-    "            print(f\"Max of latent_for_vae: {latent.max()}\")\n",
-    "            print(f\"Mean of latent_for_vae: {latent.mean()}\")\n",
-    "            print(f\"Std: {latent.std().item():.4f}\")\n",
-    "            if torch.isnan(latent).any() or torch.isinf(latent).any():\n",
-    "                print(\"WARNING: Raw latents contain NaN or Inf values!\")\n",
-    "            \n",
-    "            reconstructed_image = vae.decode(latent).sample\n",
-    "        \n",
-    "        # 2. Вытаскиваем обычную 3D-картинку [C, H, W] из 5D-видеотензора\n",
-    "        if reconstructed_image.ndim == 5:\n",
-    "            # Берем нулевой батч, все каналы, нулевой кадр, всю высоту и ширину\n",
-    "            img_tensor = reconstructed_image[0, :, 0, :, :] \n",
-    "        else:\n",
-    "            img_tensor = reconstructed_image.squeeze(0) # На всякий случай, если VAE вернул 4D\n",
-    "        \n",
-    "        img_array = img_tensor.cpu().numpy()\n",
-    "        img_array = np.transpose(img_array, (1, 2, 0))\n",
-    "        img_array = (img_array + 1) / 2  # Нормализация к [0, 1]\n",
-    "        img_array = np.clip(img_array * 255, 0, 255).astype(np.uint8)  # Преобразуем в uint8 для PIL\n",
-    "        \n",
-    "        # Создаем PIL изображение из массива\n",
-    "        pil_image = Image.fromarray(img_array)\n",
-    "        print(f\"Текст: {example['text']}\")\n",
-    "        print(f\"Ключи: {', '.join(example.keys())}\")\n",
-    "        print(f\"latent: {latent.shape}\")\n",
-    "        pil_image.save(\"1.jpg\")\n",
-    "    \n",
-    "    # Очистка памяти\n",
-    "    if torch.cuda.is_available():\n",
-    "        torch.cuda.empty_cache()\n",
-    "        gc.collect()\n",
-    "    \n",
-    "    return size_to_indices  # Возвращаем словарь с индексами по группам\n",
-    "\n",
-    "# Использование\n",
-    "if __name__ == \"__main__\":\n",
-    "    # Путь к датасету\n",
-    "    save_path = \"datasets/ds234_640_vae_qwen\"\n",
-    "    \n",
-    "    # Анализ датасета\n",
-    "    size_groups = analyze_dataset_by_size(save_path)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "74a5d11d-369f-4f25-9ee0-31d3bccd0254",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.3"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}

+version https://git-lfs.github.com/spec/v1
+oid sha256:774dc5b6f2f55e8b4e925e5ba984f73b18e2c096b6c1df4bfe0075aa51a56258
+size 8190

pipeline_sdxs.py CHANGED Viewed

@@ -14,12 +14,10 @@ class SdxsPipelineOutput(BaseOutput):
     prompt: Optional[Union[str, List[str]]] = None
 class SdxsPipeline(DiffusionPipeline):
-    # Cosmos требует 512 токенов
-    MAX_TEXT_TOKENS = 512
     def __init__(self, vae, text_encoder, tokenizer, transformer, scheduler):
         super().__init__()
-        # Регистрируем модули (с Qwen)
         self.register_modules(
             vae=vae,
             text_encoder=text_encoder,
@@ -28,62 +26,36 @@ class SdxsPipeline(DiffusionPipeline):
             scheduler=scheduler
         )
-        self.vae_scale_factor = getattr(self.vae.config, "spatial_compression_ratio", 8)
-        if hasattr(self.vae.config, "block_out_channels"):
-            self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
-        # Загружаем mean и std для VAE (Cosmos-style)
-        mean = getattr(self.vae.config, "latents_mean", None)
-        std = getattr(self.vae.config, "latents_std", None)
-        if mean is not None and std is not None:
-            self.vae_latents_mean = torch.tensor(mean).view(1, len(mean), 1, 1, 1)
-            # Внимание: Cosmos использует инвертированный std для декодирования (1.0 / std)
-            self.vae_latents_std =  torch.tensor(std).view(1, len(std), 1, 1, 1)
-        else:
-            self.vae_latents_mean = None
-            self.vae_latents_std = None
-        # Регистрируем параметры Cosmos в шедулере (если они еще не там)
-        if self.scheduler is not None:
-            self.scheduler.register_to_config(
-                sigma_max=getattr(self.scheduler.config, "sigma_max", 80.0),
-                sigma_min=getattr(self.scheduler.config, "sigma_min", 0.002),
-                sigma_data=getattr(self.scheduler.config, "sigma_data", 1.0),
-                final_sigmas_type=getattr(self.scheduler.config, "final_sigmas_type", "sigma_min"),
-            )
-    @staticmethod
-    def _pad_tensor_to_length(tensor: torch.Tensor, target_len: int, dim: int = 1, pad_value: float = 0) -> torch.Tensor:
-        current_len = tensor.shape[dim]
-        if current_len >= target_len:
-            return tensor
-        pad_size = target_len - current_len
-        if tensor.dim() == 3:
-            padding = (0, 0, 0, pad_size, 0, 0)
-        elif tensor.dim() == 2:
-            padding = (0, pad_size, 0, 0)
-        else:
-            raise ValueError(f"Unsupported tensor dimension: {tensor.dim()}")
-        return torch.nn.functional.pad(tensor, padding, value=pad_value)
-    @torch.no_grad()
     def refine_prompts(
         self,
         prompts: Union[str, List[str]],
         system_prompt: Optional[str] = None,
         temperature: float = 0.7
     ) -> List[str]:
-        """Refines a list of prompts using the Text Encoder (LLM)."""
         device = self.device
         if system_prompt is None:
             system_prompt = (
                 "You are a skilled text-to-image prompt engineer whose sole function is to transform "
-                "the user's input into an aesthetically optimized, detailed, and visually descriptive two-sentence output. "
-                "**The primary subject MUST be the main focus of the revised prompt "
-                "and MUST be described in rich detail within the first sentence.** "
                 "Output **only** the final revised prompt, with absolutely no commentary. "
-                "Don't use cliches like warm, soft, vibrant, wildflowers. Be creative. User input prompt: "
             )
         pad_id = getattr(self.text_encoder.config, "pad_token_id", None) or \
@@ -93,6 +65,7 @@ class SdxsPipeline(DiffusionPipeline):
         refined_list = []
         for p in prompts_list:
             full_text = system_prompt + p
             messages = [{"role": "user", "content": [{"type": "text", "text": full_text}]}]
@@ -120,7 +93,6 @@ class SdxsPipeline(DiffusionPipeline):
     @torch.no_grad()
     def encode_text(self, text: Union[str, List[str]]) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Qwen-specific text encoding (using chat_template and hidden_states[-2])"""
         device = self.device
         dtype = self.transformer.dtype
         if text is None: text = ""
@@ -128,221 +100,148 @@ class SdxsPipeline(DiffusionPipeline):
         formatted_prompts = []
         for t in text:
             messages = [{"role": "user", "content": [{"type": "text", "text": t}]}]
             formatted_prompts.append(self.tokenizer.apply_chat_template(messages, add_generation_prompt=False, tokenize=False))
-        toks = self.tokenizer(formatted_prompts, padding="max_length", max_length=self.MAX_TEXT_TOKENS, truncation=True, return_tensors="pt").to(device)
-        outputs = self.text_encoder(input_ids=toks.input_ids, attention_mask=toks.attention_mask, output_hidden_states=True)
-        # Берем предпоследний слой эмбеддингов, как того требуют современные пайплайны
-        last_hidden = outputs.hidden_states[-2]
-        return last_hidden.to(dtype=dtype), toks.attention_mask.to(dtype=torch.int64)
-    @torch.no_grad()
-    def image_upscale(self, image: Union[str, Image.Image, List[Union[str, Image.Image]]], batch_size: int = 1) -> List[Image.Image]:
-        images = [image] if isinstance(image, (str, Image.Image)) else image
-        batch_data = []
-        for img in images:
-            if isinstance(img, str): img = Image.open(img)
-            if img.mode == "RGBA":
-                img = Image.alpha_composite(Image.new("RGBA", img.size, (255, 255, 255)), img)
-            img = img.convert("RGB")
-            w, h = img.size
-            pw, ph = (8 - w % 8) % 8, (8 - h % 8) % 8
-            if pw or ph:
-                padded = Image.new("RGB", (w + pw, h + ph), (255, 255, 255))
-                padded.paste(img)
-                img = padded
-            t = torch.from_numpy(np.array(img).astype(np.float32) / 127.5 - 1.0).permute(2, 0, 1)
-            batch_data.append((t.to(self.device, torch.float16), w, h))
-        unique_shapes = {t.shape for t, _, _ in batch_data}
-        step = batch_size if len(unique_shapes) == 1 else 1
-        output_images = []
-        for i in range(0, len(batch_data), step):
-            chunk = batch_data[i : i + step]
-            tensors = torch.stack([c[0] for c in chunk]).unsqueeze(2)
-            latents = self.vae.encode(tensors).latent_dist.mean
-            decoded = self.vae.decode(latents.to(self.vae.dtype))[0]
-            if decoded.ndim == 5:
-                decoded = decoded.squeeze(2)
-            decoded = (decoded.clamp(-1, 1) + 1) / 2
-            for j, tensor in enumerate(decoded):
-                w, h = chunk[j][1], chunk[j][2]
-                arr = tensor.cpu().permute(1, 2, 0).float().numpy()
-                arr = arr[:h * 2, :w * 2]
-                output_images.append(Image.fromarray((arr * 255).astype("uint8")))
-        return output_images
     @torch.no_grad()
     def __call__(
         self,
         prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt: Optional[Union[str, List[str]]] = None,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        negative_prompt_embeds: Optional[torch.Tensor] = None,
-        prompt_attention_mask: Optional[torch.Tensor] = None,
-        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
-        latents: Optional[torch.Tensor] = None,
-        height: int = 1024,
-        width: int = 1024,
         num_inference_steps: int = 40,
         guidance_scale: float = 4.0,
-        generator: Optional[torch.Generator] = None,
         seed: Optional[int] = None,
         output_type: str = "pil",
         return_dict: bool = True,
-        **kwargs,
     ):
         device = self.device
         dtype = self.transformer.dtype
-        if generator is None and seed is not None:
             generator = torch.Generator(device=device).manual_seed(seed)
         do_classifier_free_guidance = guidance_scale > 1.0
-        # 1. Encode Positive
-        if prompt_embeds is None:
-            if prompt is None: raise ValueError("`prompt` or `prompt_embeds` required.")
-            prompt_embeds, prompt_attention_mask = self.encode_text(prompt)
-        prompt_embeds = prompt_embeds.to(device=device, dtype=dtype)
-        prompt_attention_mask = prompt_attention_mask.to(device=device, dtype=torch.int64)
         batch_size = prompt_embeds.shape[0]
-        # 2. Encode Negative
         if do_classifier_free_guidance:
-            if negative_prompt_embeds is None:
-                neg_text = negative_prompt if negative_prompt is not None else ("" if isinstance(prompt, str) else [""] * len(prompt))
-                negative_prompt_embeds, negative_prompt_attention_mask = self.encode_text(neg_text)
-            negative_prompt_embeds = negative_prompt_embeds.to(device=device, dtype=dtype)
-            negative_prompt_attention_mask = negative_prompt_attention_mask.to(device=device, dtype=torch.int64)
-            if negative_prompt_embeds.shape[0] != batch_size:
-                negative_prompt_embeds = negative_prompt_embeds.repeat(batch_size, 1, 1)
-                negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(batch_size, 1)
-            max_len = max(prompt_embeds.shape[1], negative_prompt_embeds.shape[1])
-            prompt_embeds = self._pad_tensor_to_length(prompt_embeds, max_len, dim=1, pad_value=0)
-            negative_prompt_embeds = self._pad_tensor_to_length(negative_prompt_embeds, max_len, dim=1, pad_value=0)
-            prompt_attention_mask = self._pad_tensor_to_length(prompt_attention_mask, max_len, dim=1, pad_value=0)
-            negative_prompt_attention_mask = self._pad_tensor_to_length(negative_prompt_attention_mask, max_len, dim=1, pad_value=0)
-            text_embeddings = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
         else:
             text_embeddings = prompt_embeds
-        # 3. Prepare Timesteps (Cosmos specific schedule)
-        sigmas_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
-        sigmas = torch.linspace(0, 1, num_inference_steps, dtype=sigmas_dtype)
-        self.scheduler.set_timesteps(sigmas=sigmas, device=device)
-        timesteps = self.scheduler.timesteps
-        # Защита от деления на ноль на последнем шаге
-        if self.scheduler.config.get("final_sigmas_type", "zero") == "sigma_min":
-            self.scheduler.sigmas[-1] = self.scheduler.sigmas[-2]
-        if self.scheduler.sigmas[-1] == 0.0:
-            self.scheduler.sigmas[-1] = 1e-4
-        # 4. Prepare Latents (Noise)
         latent_h = height // self.vae_scale_factor
         latent_w = width // self.vae_scale_factor
         in_channels = self.transformer.config.in_channels
-        sigma_max = getattr(self.scheduler.config, "sigma_max", 80.0)
-        if latents is None:
-            # Создаем 5D тензор [Batch, Channels, Frames, Height, Width]
-            latents = torch.randn((batch_size, in_channels, 1, latent_h, latent_w), generator=generator, device=device, dtype=dtype)
-            latents = latents * sigma_max
-        else:
-            latents = latents.to(device=device, dtype=dtype) * sigma_max
-        # Cosmos Padding Mask
-        padding_mask = torch.zeros((1, 1, height, width), device=device, dtype=dtype)
-        # 5. Denoising Loop (Continuous Flow Math)
-        for i, t in enumerate(tqdm(timesteps, desc="Sampling")):
-            current_sigma = self.scheduler.sigmas[i]
-            # Защита от деления на 0 при вычислении current_t
-            if current_sigma == 0.0:
-                current_sigma = torch.tensor(1e-4, dtype=current_sigma.dtype, device=device)
-            current_t = current_sigma / (current_sigma + 1.0)
-            c_in = 1.0 - current_t
-            c_skip = 1.0 - current_t
-            c_out = -current_t
             latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-            latent_model_input = (latent_model_input * c_in).to(dtype)
-            # Трансформер ждет timestep в виде 1D тензора [B]
-            t_val = float(current_t.item()) if torch.is_tensor(current_t) else float(current_t)
-            timestep_tensor = torch.tensor(
-                [t_val],
-                device=device,
-                dtype=dtype
-            ).view(1, 1, 1, 1, 1).expand(latent_model_input.shape[0], 1, 1, 1, 1)
-            model_out = self.transformer(
                 hidden_states=latent_model_input,
-                timestep=timestep_tensor,
                 encoder_hidden_states=text_embeddings,
                 padding_mask=padding_mask,
                 return_dict=False,
             )[0]
-            batched_latents = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-            noise_pred = (c_skip * batched_latents + c_out * model_out.float()).to(dtype)
             if do_classifier_free_guidance:
-                noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
-                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
-            noise_pred = (latents - noise_pred) / current_sigma
-            latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
-        # 6. Decode
         if output_type == "latent":
-            if not return_dict: return (latents, prompt)
             return SdxsPipelineOutput(images=latents)
-        if getattr(self.vae.config, "latents_std", None) is not None and getattr(self.vae.config, "latents_mean", None) is not None:
-            sigma_data = getattr(self.scheduler.config, "sigma_data", 1.0)
             l_mean = torch.tensor(self.vae.config.latents_mean).view(1, -1, 1, 1, 1).to(device, dtype)
             l_std = torch.tensor(self.vae.config.latents_std).view(1, -1, 1, 1, 1).to(device, dtype)
-            # Оригинальная формула: делим на инвертированный std (что равноценно умножению на std)
-            #latents_std_inv = 1.0 / l_std
             latents = latents * l_std + l_mean
-        image_output = self.vae.decode(latents.to(self.vae.dtype), return_dict=False)[0]
         if image_output.ndim == 5:
-            image_output = image_output.squeeze(2)
         image_output = (image_output.clamp(-1, 1) + 1) / 2
         image_np = image_output.cpu().permute(0, 2, 3, 1).float().numpy()
-        # На всякий случай вычищаем NaNs
-        image_np = np.nan_to_num(image_np, nan=0.0, posinf=1.0, neginf=0.0)
         if output_type == "pil":
-            images = [(Image.fromarray((img * 255).round().astype("uint8"))) for img in image_np]
         else:
             images = image_np
-        if not return_dict:
-            return (images,)
-        return SdxsPipelineOutput(images=images)

     prompt: Optional[Union[str, List[str]]] = None
 class SdxsPipeline(DiffusionPipeline):
+    MAX_TEXT_TOKENS = 400 # не Соответствует max_length в обучении
     def __init__(self, vae, text_encoder, tokenizer, transformer, scheduler):
         super().__init__()
         self.register_modules(
             vae=vae,
             text_encoder=text_encoder,
             scheduler=scheduler
         )
+        self.vae_scale_factor = 8
+    @torch.no_grad()
     def refine_prompts(
         self,
         prompts: Union[str, List[str]],
         system_prompt: Optional[str] = None,
         temperature: float = 0.7
     ) -> List[str]:
+        """
+        Refines a list of prompts using the Text Encoder (LLM).
+        Args:
+            prompts: Single prompt string or list of prompts.
+            system_prompt: Custom instruction for the LLM. If None, uses default aesthetic enhancer.
+            temperature: Sampling temperature for generation.
+        Returns:
+            List of refined prompts.
+        """
         device = self.device
+        # Default system prompt if none provided
         if system_prompt is None:
             system_prompt = (
                 "You are a skilled text-to-image prompt engineer whose sole function is to transform "
+                "the user's input into an aesthetic, detailed, and visually descriptive three-sentence output. "
                 "Output **only** the final revised prompt, with absolutely no commentary. "
+                "Don't use cliches like warm, soft, vibrant, wildflowers. User input prompt: "
             )
         pad_id = getattr(self.text_encoder.config, "pad_token_id", None) or \
         refined_list = []
         for p in prompts_list:
+            # Prepend system prompt to user input
             full_text = system_prompt + p
             messages = [{"role": "user", "content": [{"type": "text", "text": full_text}]}]
     @torch.no_grad()
     def encode_text(self, text: Union[str, List[str]]) -> Tuple[torch.Tensor, torch.Tensor]:
         device = self.device
         dtype = self.transformer.dtype
         if text is None: text = ""
         formatted_prompts = []
         for t in text:
+            # Повторяем логику чат-шаблона из обучения
             messages = [{"role": "user", "content": [{"type": "text", "text": t}]}]
             formatted_prompts.append(self.tokenizer.apply_chat_template(messages, add_generation_prompt=False, tokenize=False))
+        toks = self.tokenizer(
+            formatted_prompts,
+            padding="max_length",
+            max_length=self.MAX_TEXT_TOKENS,
+            truncation=True,
+            return_tensors="pt"
+        ).to(device)
+        outputs = self.text_encoder(
+            input_ids=toks.input_ids,
+            attention_mask=toks.attention_mask,
+            output_hidden_states=True
+        )
+        # Берем предпоследний слой (-2) как в обучении
+        last_hidden = outputs.hidden_states[-2].to(dtype=dtype)
+        # Обнуляем паддинги для честности (как в обучении)
+        lengths = toks.attention_mask.sum(dim=1)
+        for i, length in enumerate(lengths):
+            last_hidden[i, length:] = 0
+        return last_hidden, toks.attention_mask.to(dtype=torch.int64)
     @torch.no_grad()
     def __call__(
         self,
         prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt: Optional[Union[str, List[str]]] = None,
+        height: int = 1152,
+        width: int = 768,
         num_inference_steps: int = 40,
         guidance_scale: float = 4.0,
         seed: Optional[int] = None,
         output_type: str = "pil",
         return_dict: bool = True,
     ):
         device = self.device
         dtype = self.transformer.dtype
+        if seed is not None:
             generator = torch.Generator(device=device).manual_seed(seed)
+        else:
+            generator = None
         do_classifier_free_guidance = guidance_scale > 1.0
+        # 1. Encode Prompts
+        prompt_embeds, prompt_mask = self.encode_text(prompt)
         batch_size = prompt_embeds.shape[0]
         if do_classifier_free_guidance:
+            neg_text = negative_prompt if negative_prompt is not None else ([""] * batch_size)
+            neg_embeds, neg_mask = self.encode_text(neg_text)
+            # Конкатенация для батч-генерации (uncond + cond)
+            text_embeddings = torch.cat([neg_embeds, prompt_embeds], dim=0)
+            # В вашем обучении padding_mask в модель передавался как нули,
+            # но внутри трансформера обычно используется encoder_attention_mask
         else:
             text_embeddings = prompt_embeds
+        # 2. Prepare Timesteps (Flow Matching: от 1.0 к 0.0)
+        # В обучении t=1 был шумом, t=0 — данными.
+        timesteps = torch.linspace(1.0, 0.0, num_inference_steps + 1, device=device, dtype=dtype)
+        # 3. Prepare Latents
         latent_h = height // self.vae_scale_factor
         latent_w = width // self.vae_scale_factor
         in_channels = self.transformer.config.in_channels
+        # В Flow Matching начальный шум имеет стандартное отклонение 1.0
+        latents = torch.randn(
+            (batch_size, in_channels, 1, latent_h, latent_w),
+            generator=generator,
+            device=device,
+            dtype=dtype
+        )
+        # Пустая маска как в обучении
+        padding_mask = torch.zeros((1, 1, latent_h, latent_w), device=device, dtype=dtype)
+        # 4. Denoising Loop (Euler Method)
+        for i in tqdm(range(num_inference_steps), desc="Sampling"):
+            t_curr = timesteps[i]
+            t_next = timesteps[i+1]
+            # Подготовка входа (CFG)
             latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            # Модель обучалась на t.flatten(), передаем как вектор [B]
+            t_vec = t_curr.expand(latent_model_input.shape[0])
+            # Предсказание "скорости" (v)
+            # Т.к. в обучении target = noise - clean, модель предсказывает направление к шуму
+            model_output = self.transformer(
                 hidden_states=latent_model_input,
+                timestep=t_vec,
                 encoder_hidden_states=text_embeddings,
                 padding_mask=padding_mask,
                 return_dict=False,
             )[0]
             if do_classifier_free_guidance:
+                v_uncond, v_cond = model_output.chunk(2)
+                v_final = v_uncond + guidance_scale * (v_cond - v_uncond)
+            else:
+                v_final = model_output
+            # Euler шаг: x_{t-1} = x_t + (t_next - t_curr) * v
+            # Поскольку t идет от 1 к 0, (t_next - t_curr) будет отрицательным,
+            # что правильно двигает нас от шума к данным.
+            latents = latents + (t_next - t_curr) * v_final
+        # 5. Decode
         if output_type == "latent":
             return SdxsPipelineOutput(images=latents)
+        # Применяем де-нормализацию VAE как в обучении
+        if getattr(self.vae.config, "latents_std", None) is not None:
             l_mean = torch.tensor(self.vae.config.latents_mean).view(1, -1, 1, 1, 1).to(device, dtype)
             l_std = torch.tensor(self.vae.config.latents_std).view(1, -1, 1, 1, 1).to(device, dtype)
             latents = latents * l_std + l_mean
+        # Декодируем
+        with torch.no_grad():
+            image_output = self.vae.decode(latents.to(self.vae.dtype), return_dict=False)[0]
         if image_output.ndim == 5:
+            image_output = image_output.squeeze(2) # Убираем временную ось (Frames=1)
         image_output = (image_output.clamp(-1, 1) + 1) / 2
         image_np = image_output.cpu().permute(0, 2, 3, 1).float().numpy()
+        image_np = np.nan_to_num(image_np, nan=0.0)
         if output_type == "pil":
+            images = [Image.fromarray((img * 255).round().astype("uint8")) for img in image_np]
         else:
             images = image_np
+        return SdxsPipelineOutput(images=images, prompt=prompt)

refined.jpg ADDED Viewed

Git LFS Details

SHA256: b08900de198c3d22e7e5dea378269caf74a681d050e677a8c4c299f35fd1f34f
Pointer size: 131 Bytes
Size of remote file: 104 kB

scheduler/.ipynb_checkpoints/scheduler_config-checkpoint.json DELETED Viewed

@@ -1,22 +0,0 @@
-{
-  "_class_name": "FlowMatchEulerDiscreteScheduler",
-  "_diffusers_version": "0.34.0.dev0",
-  "base_image_seq_len": 256,
-  "base_shift": 0.5,
-  "final_sigmas_type": "sigma_min",
-  "invert_sigmas": false,
-  "max_image_seq_len": 4096,
-  "max_shift": 1.15,
-  "num_train_timesteps": 1000,
-  "shift": 1.0,
-  "shift_terminal": null,
-  "sigma_data": 1.0,
-  "sigma_max": 80.0,
-  "sigma_min": 0.002,
-  "stochastic_sampling": false,
-  "time_shift_type": "exponential",
-  "use_beta_sigmas": false,
-  "use_dynamic_shifting": false,
-  "use_exponential_sigmas": false,
-  "use_karras_sigmas": true
-}

scheduler/scheduler_config.json CHANGED Viewed

@@ -1,22 +1,3 @@
-{
-  "_class_name": "FlowMatchEulerDiscreteScheduler",
-  "_diffusers_version": "0.34.0.dev0",
-  "base_image_seq_len": 256,
-  "base_shift": 0.5,
-  "final_sigmas_type": "sigma_min",
-  "invert_sigmas": false,
-  "max_image_seq_len": 4096,
-  "max_shift": 1.15,
-  "num_train_timesteps": 1000,
-  "shift": 1.0,
-  "shift_terminal": null,
-  "sigma_data": 1.0,
-  "sigma_max": 80.0,
-  "sigma_min": 0.002,
-  "stochastic_sampling": false,
-  "time_shift_type": "exponential",
-  "use_beta_sigmas": false,
-  "use_dynamic_shifting": false,
-  "use_exponential_sigmas": false,
-  "use_karras_sigmas": true
-}

+version https://git-lfs.github.com/spec/v1
+oid sha256:65b3e9ccde6e3727aab1c612e7279599f861aec2fb9354880ab9ef8753c492b6
+size 485

test.ipynb CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:677906d20fb691440965fb107de2c9d8e9b7c75884d9e3e15b4375f4257df8ae
-size 21416092

 version https://git-lfs.github.com/spec/v1
+oid sha256:feff1f3730b8dae616e3ffd24b2f74dcd9c6776c46e00ac72018e0de74785d06
+size 18136603

train-sdxs2b.py CHANGED Viewed

@@ -17,7 +17,7 @@ from torch.utils.data import DataLoader, Sampler
 from torch.optim.lr_scheduler import LambdaLR
 from collections import defaultdict
 from accelerate import Accelerator
-from datasets import load_from_disk
 from tqdm import tqdm
 from PIL import Image, ImageOps
 from torch.utils.checkpoint import checkpoint
@@ -33,7 +33,7 @@ os.environ["NCCL_IB_DISABLE"] = "1" # comment this on H100!
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 # --------------------------- Параметры ---------------------------
-ds_path = "/root/ds12345_640_vae_qwen"
 project = "transformer"
 gpu_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
@@ -81,8 +81,8 @@ torch.backends.cuda.matmul.allow_tf32 = True
 torch.backends.cudnn.allow_tf32 = True
 torch.backends.cuda.enable_flash_sdp(True)
 torch.backends.cuda.enable_mem_efficient_sdp(True)
-torch.backends.cuda.enable_math_sdp(False)
-save_barrier = 1.25
 warmup_percent = 0.0025
 betta2 = 0.997
 eps = 1e-6
@@ -223,7 +223,7 @@ def encode_texts(text, max_length=max_length):
     for i, length in enumerate(lengths):
         hidden[i, length:] = 0
-    return hidden, toks.attention_mask.to(dtype=torch.int64)
 @torch.no_grad()
 def encode_texts_fast(text, max_length=max_length):
@@ -244,7 +244,7 @@ def encode_texts_fast(text, max_length=max_length):
     for i, length in enumerate(lengths):
         last_hidden[i, length:] = 0
-    return last_hidden, toks.attention_mask.to(dtype=torch.int64)
 shift_factor = getattr(vae.config, "shift_factor", 0.0)
 if shift_factor is None:
@@ -375,7 +375,7 @@ def get_fixed_samples_by_resolution(dataset, samples_per_group=1):
             masks = torch.tensor(
                 np.array([item["attention_mask"] for item in samples_data]),
                 device=device,
-                dtype=torch.int64
             )
         else:
             embeddings, masks = encode_texts(texts,max_length)
@@ -388,7 +388,30 @@ def get_fixed_samples_by_resolution(dataset, samples_per_group=1):
 if limit > 0:
     dataset = load_from_disk(ds_path).select(range(limit))
 else:
-    dataset = load_from_disk(ds_path)
 print(f"images: {len(dataset)}")
@@ -424,7 +447,7 @@ def collate_fn_simple(batch):
     ]
     embeddings, attention_mask = encode_texts(texts,max_length)
-    attention_mask = attention_mask.to(dtype=torch.int64)
     return latents, embeddings, attention_mask
@@ -552,7 +575,7 @@ def get_negative_embedding(neg_prompt="", batch_size=1):
         hidden_dim = 2048
         seq_len = max_length
         empty_emb = torch.zeros((batch_size, seq_len, hidden_dim), dtype=dtype, device=device)
-        empty_mask = torch.ones((batch_size, seq_len), dtype=torch.int64, device=device)
         return empty_emb, empty_mask
     uncond_emb, uncond_mask  = encode_texts([neg_prompt],max_length)

 from torch.optim.lr_scheduler import LambdaLR
 from collections import defaultdict
 from accelerate import Accelerator
+from datasets import load_from_disk,concatenate_datasets
 from tqdm import tqdm
 from PIL import Image, ImageOps
 from torch.utils.checkpoint import checkpoint
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 # --------------------------- Параметры ---------------------------
+ds_path = "datasets/dsb_640_vae_qwen_temp"
 project = "transformer"
 gpu_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
 torch.backends.cudnn.allow_tf32 = True
 torch.backends.cuda.enable_flash_sdp(True)
 torch.backends.cuda.enable_mem_efficient_sdp(True)
+torch.backends.cuda.enable_math_sdp(True)
+save_barrier = 1.4
 warmup_percent = 0.0025
 betta2 = 0.997
 eps = 1e-6
     for i, length in enumerate(lengths):
         hidden[i, length:] = 0
+    return hidden, toks.attention_mask.to(dtype=torch.bool)
 @torch.no_grad()
 def encode_texts_fast(text, max_length=max_length):
     for i, length in enumerate(lengths):
         last_hidden[i, length:] = 0
+    return last_hidden, toks.attention_mask.to(dtype=torch.bool)
 shift_factor = getattr(vae.config, "shift_factor", 0.0)
 if shift_factor is None:
             masks = torch.tensor(
                 np.array([item["attention_mask"] for item in samples_data]),
                 device=device,
+                dtype=torch.bool
             )
         else:
             embeddings, masks = encode_texts(texts,max_length)
 if limit > 0:
     dataset = load_from_disk(ds_path).select(range(limit))
 else:
+    print(">>> Поиск чанков датасета...")
+    chunks = []
+    for d in os.listdir(ds_path):
+        full_p = os.path.join(ds_path, d)
+        if os.path.isdir(full_p):
+            chunks.append(full_p)
+    if not chunks:
+        print("❌ Чанки не найдены!")
+    print(f">>> Найдено чанков: {len(chunks)}. Начинаю загрузку и объединение...")
+    # 2. Ленивая загрузка всех чанков
+    # load_from_disk не ест RAM, пока мы не обращаемся к данным
+    ds_list = []
+    for c in chunks:
+        try:
+            ds_list.append(load_from_disk(c))
+        except Exception as e:
+            print(f"⚠️ Ошибка загрузки чанка {c}: {e}")
+    # 3. Конкатенация (создает виртуальный объединенный датасет)
+    dataset = concatenate_datasets(ds_list)
+    #dataset = load_from_disk(ds_path)
 print(f"images: {len(dataset)}")
     ]
     embeddings, attention_mask = encode_texts(texts,max_length)
+    attention_mask = attention_mask.to(dtype=torch.bool)
     return latents, embeddings, attention_mask
         hidden_dim = 2048
         seq_len = max_length
         empty_emb = torch.zeros((batch_size, seq_len, hidden_dim), dtype=dtype, device=device)
+        empty_mask = torch.ones((batch_size, seq_len), dtype=torch.bool, device=device)
         return empty_emb, empty_mask
     uncond_emb, uncond_mask  = encode_texts([neg_prompt],max_length)

transformer/diffusion_pytorch_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3188c7ffba9a6fb9e646536503a99a8e0b1530251793ab1f5ff4b73b4df04542
-size 7825687184

 version https://git-lfs.github.com/spec/v1
+oid sha256:ee77d7083d1f968607fbcc531deae347d72c2fb229bbe40356e44a8edae26aec
+size 3912877104

wandb/debug-internal.log DELETED Viewed

The diff for this file is too large to render. See raw diff

wandb/debug-internal.log ADDED Viewed

	@@ -0,0 +1 @@


1	+ run-20260513_080408-xhrf3max/logs/debug-internal.log

wandb/debug.log DELETED Viewed

@@ -1,19 +0,0 @@
-2026-05-05 07:53:13,457 INFO    MainThread:43955 [wandb_setup.py:_flush():81] Current SDK version is 0.26.1
-2026-05-05 07:53:13,457 INFO    MainThread:43955 [wandb_setup.py:_flush():81] Configure stats pid to 43955
-2026-05-05 07:53:13,457 INFO    MainThread:43955 [wandb_setup.py:_flush():81] Loading settings from environment variables
-2026-05-05 07:53:13,457 INFO    MainThread:43955 [wandb_init.py:setup_run_log_directory():723] Logging user logs to /workspace/2b/wandb/run-20260505_075313-ti70f47q/logs/debug.log
-2026-05-05 07:53:13,458 INFO    MainThread:43955 [wandb_init.py:setup_run_log_directory():724] Logging internal logs to /workspace/2b/wandb/run-20260505_075313-ti70f47q/logs/debug-internal.log
-2026-05-05 07:53:13,458 INFO    MainThread:43955 [wandb_init.py:init():850] calling init triggers
-2026-05-05 07:53:13,458 INFO    MainThread:43955 [wandb_init.py:init():855] wandb.init called with sweep_config: {}
-config: {'batch_size': 24, 'base_learning_rate': 1.3333333333333335e-05, 'num_epochs': 1, 'optimizer_type': 'adafactor', '_wandb': {}}
-2026-05-05 07:53:13,458 INFO    MainThread:43955 [wandb_init.py:init():898] starting backend
-2026-05-05 07:53:13,663 INFO    MainThread:43955 [wandb_init.py:init():913] sending inform_init request
-2026-05-05 07:53:13,842 INFO    MainThread:43955 [wandb_init.py:init():918] backend started and connected
-2026-05-05 07:53:13,844 INFO    MainThread:43955 [wandb_init.py:init():988] updated telemetry
-2026-05-05 07:53:13,845 INFO    MainThread:43955 [wandb_init.py:init():1011] communicating run to backend with 90.0 second timeout
-2026-05-05 07:53:14,174 INFO    MainThread:43955 [wandb_init.py:init():1056] starting run threads in backend
-2026-05-05 07:53:14,261 INFO    MainThread:43955 [wandb_run.py:_console_start():2554] atexit reg
-2026-05-05 07:53:14,262 INFO    MainThread:43955 [wandb_run.py:_redirect():2403] redirect: wrap_raw
-2026-05-05 07:53:14,262 INFO    MainThread:43955 [wandb_run.py:_redirect():2472] Wrapping output streams.
-2026-05-05 07:53:14,262 INFO    MainThread:43955 [wandb_run.py:_redirect():2495] Redirects installed.
-2026-05-05 07:53:14,267 INFO    MainThread:43955 [wandb_init.py:init():1094] run started, returning control to user process

wandb/debug.log ADDED Viewed

	@@ -0,0 +1 @@


1	+ run-20260513_080408-xhrf3max/logs/debug.log