recoilme committed
Commit 19de470 · 1 Parent(s): 4265f4a
dataset.py CHANGED
@@ -16,22 +16,19 @@ from typing import Dict, List, Tuple, Optional, Any
 from PIL import Image
 from tqdm import tqdm
 from datetime import timedelta
-import subprocess
-import tempfile
 
 # ---------------- 1️⃣ Settings ----------------
 dtype = torch.float16
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 batch_size = 10
-min_size = 320
-max_size = 640
-step = 64
-empty_share = 0.05
+min_size = 384
+max_size = 768
+step = 32
+empty_share = 0.0  # share of captions blanked for classifier-free guidance
 limit = 0
 # Main processing pipeline
-folder_path = "/workspace/tar"
-save_path = "/workspace/640"
-dir_tmp = "/workspace/tmp"
+folder_path = "/workspace/dataset/dataset/ae3"
+save_path = "/workspace/ae3_768"
 os.makedirs(save_path, exist_ok=True)
 
 # Function to clear CUDA memory
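For context, these four knobs drive the aspect-ratio bucketing. The bucket-generation code is outside this hunk, but judging by the chunk_{i}_size_{W}x{H} paths produced later in the script, a sketch of what they plausibly control:

# Sketch only (the actual bucketing code is not part of this diff):
# candidate resolutions as multiples of `step` between min_size and max_size.
buckets = [
    (w, h)
    for w in range(min_size, max_size + 1, step)
    for h in range(min_size, max_size + 1, step)
]
# With min_size=384, max_size=768, step=32 this yields 13*13 = 169 buckets;
# the old settings (320/640/64) gave 6*6 = 36.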
@@ -45,18 +42,21 @@ def clear_cuda_memory():
 # ---------------- 2️⃣ Model loading ----------------
 def load_models():
     print("Loading models...")
-    vae = AutoencoderKL.from_pretrained("AiArtLab/sdxs3d", subfolder="vae", torch_dtype=dtype).to(device).eval()
-
-    model_name = "Qwen/Qwen3-0.6B"
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        torch_dtype=dtype,
-        device_map=device
-    ).eval()
-    return vae, model, tokenizer
-
-vae, model, tokenizer = load_models()
+    vae = AutoencoderKL.from_pretrained("AiArtLab/sdxs", subfolder="vae1x", torch_dtype=dtype).to(device).eval()
+
+    #model_name = "Qwen/Qwen3-0.6B"
+    #tokenizer = AutoTokenizer.from_pretrained(model_name)
+    #model = AutoModelForCausalLM.from_pretrained(
+    #    model_name,
+    #    torch_dtype=dtype,
+    #    device_map=device
+    #).eval()
+    #tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-Embedding-0.6B', padding_side='left')
+    #model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B').to("cuda")
+    return vae  #, model, tokenizer
+
+#vae, model, tokenizer = load_models()
+vae = load_models()
 
 shift_factor = getattr(vae.config, "shift_factor", 0.0)
 if shift_factor is None:
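The text encoder is now disabled; only the VAE is loaded, and the `shift_factor` read here feeds latent normalization. A minimal sketch of the standard diffusers convention, assuming a hypothetical `pixel_values` batch tensor (the encode step itself is outside this hunk):

with torch.inference_mode():
    # Encode to latent space; `pixel_values` is a stand-in [B, 3, H, W] tensor.
    z = vae.encode(pixel_values.to(device, dtype)).latent_dist.sample()
    # Standard diffusers normalization: shift, then scale.
    scaling_factor = getattr(vae.config, "scaling_factor", 1.0)
    latents = (z - shift_factor) * scaling_factor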
@@ -136,7 +136,7 @@ def last_token_pool(last_hidden_states: torch.Tensor,
     batch_size = last_hidden_states.shape[0]
     return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
 
-def encode_texts_batch(texts, tokenizer, model, device="cuda", max_length=150):
+def encode_texts_batch(texts, tokenizer, model, device="cuda", max_length=150, normalize=False):
     with torch.inference_mode():
         # Tokenization
         batch = tokenizer(
@@ -147,16 +147,32 @@ def encode_texts_batch(texts, tokenizer, model, device="cuda", max_length=150):
             max_length=max_length
         ).to(device)
 
+        # Forward pass through the model
+        #outputs = model(**batch)
+
+        # Last-token pooling
+        #embeddings = last_token_pool(outputs.last_hidden_state, batch["attention_mask"])
+
+        # L2 normalization (optional; usually wanted for semantic search)
+        #if normalize:
+        #    embeddings = F.normalize(embeddings, p=2, dim=1)
+
         # Forward pass through the base model (inside the CausalLM)
         outputs = model.model(**batch, output_hidden_states=True)
 
         # Take the last layer (embeddings of all tokens)
-        hidden_states = outputs.last_hidden_state
+        hidden_states = outputs.hidden_states[-1]  # [B, L, D]
+
+        # Optionally L2-normalize each token embedding (as in CLIP)
+        if normalize:
+            hidden_states = F.normalize(hidden_states, p=2, dim=-1)
 
     return hidden_states.cpu().numpy()  # embeddings.unsqueeze(1).cpu().numpy()
 
 def clean_label(label):
-    label = label.replace("Image 1", "").replace("Image 2", "").replace("Image 3", "").replace("Image 4", "")
+    label = label.replace("Image 1", "").replace("Image 2", "").replace("Image 3", "").replace("Image 4", "").replace("The image depicts ", "").replace("The image presents ", "").replace("The image features ", "").replace("The image portrays ", "").replace("The image is ", "").strip()
+    if label.startswith("."):
+        label = label[1:].lstrip()
     return label
 
 def process_labels_for_guidance(original_labels, prob_to_make_empty=0.01):
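An illustrative run of the extended clean_label (captions are hypothetical, not from the dataset):

print(clean_label("The image depicts a red fox resting on mossy rocks."))
# -> "a red fox resting on mossy rocks."
print(clean_label(". Portrait of an astronaut"))
# -> "Portrait of an astronaut"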
@@ -223,11 +239,11 @@ def encode_to_latents(images, texts):
     text_labels = [clean_label(text) for text in texts]
 
     model_prompts, text_labels = process_labels_for_guidance(text_labels, empty_share)
-    embeddings = encode_texts_batch(model_prompts, tokenizer, model)
+    #embeddings = encode_texts_batch(model_prompts, tokenizer, model)
 
     return {
         "vae": latents_np,
-        "embeddings": embeddings,
+        #"embeddings": embeddings,
         "text": text_labels,
         "width": widths,
         "height": heights
@@ -341,10 +357,6 @@ def process_in_chunks(image_paths, text_paths, width, height, chunk_size=10000,
         # Save the group's results
         group_save_path = f"{save_path}_temp/chunk_{chunk_idx}_size_{size_key[0]}x{size_key[1]}"
         processed_group.save_to_disk(group_save_path)
-
-        subprocess.run(["tar", "-I", "zstd", "-cf", f"{group_save_path}.tar.zst", "-C", group_save_path, "."], check=True)
-        shutil.rmtree(group_save_path)
-
         clear_cuda_memory()
         elapsed = time.time() - start_time
         processed = (chunk_idx - 1) * chunk_size + sum([len(sg["image_paths"]) for sg in list(size_groups.values())[:list(size_groups.values()).index(group_data) + 1]])
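The `remaining` value used in the ETA print below is computed outside the shown hunks; a linear extrapolation consistent with the printed output would be:

# Assumed ETA arithmetic (the defining line is not part of this diff):
remaining = elapsed * (total_files - processed) / max(processed, 1)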
@@ -354,60 +366,22 @@ def process_in_chunks(image_paths, text_paths, width, height, chunk_size=10000,
         remaining_str = str(timedelta(seconds=int(remaining)))
         print(f"ETA: elapsed {elapsed_str}, remaining {remaining_str}, progress {processed}/{total_files} ({processed/total_files:.1%})")
 
-# ---------------- 7️⃣ Merging ----------------
-def safe_repack(ds):
-    """
-    Repack the dataset into small .arrow files.
-    WHAT IT DOES:
-    - reduces the Arrow chunk size
-    - writer_batch_size=1000 → Arrow files of 30–60 MB
-    """
-    return ds.map(lambda x: x, batched=True, writer_batch_size=1000)
-
-
+# ---------------- 7️⃣ Chunk merging ----------------
 def combine_chunks(temp_path, final_path):
-    archives = sorted(
-        f for f in os.listdir(temp_path)
-        if f.endswith(".tar.zst")
-    )
-    archives = [os.path.join(temp_path, f) for f in archives]
-
-    print(f"Found {len(archives)} archives.")
-
-    # Initialize an empty dataset
-    merged_ds = None
-
-    for i, arc in enumerate(archives):
-        print(f"[{i+1}/{len(archives)}] Processing {arc}")
-
-        # Unpack
-        tmp = tempfile.mkdtemp(dir=dir_tmp)
-        subprocess.run(["tar", "-I", "zstd", "-xf", arc, "-C", tmp], check=True)
-
-        # Load the dataset
-        ds = load_from_disk(tmp)
-
-        # Repack the chunk dataset to shrink its Arrow files
-        ds = safe_repack(ds)
-
-        # Merge
-        if merged_ds is None:
-            merged_ds = ds
-        else:
-            merged_ds = concatenate_datasets([merged_ds, ds])
-
-        # cleanup
-        shutil.rmtree(tmp)
-        os.remove(arc)
-
-    # Final repack
-    print("⚙️ Final repack...")
-    merged_ds = safe_repack(merged_ds)
-
-    print("💾 Final save...")
-    merged_ds.save_to_disk(final_path)
-
-    print(f"✅ Dataset saved: {final_path}")
+    """Merge the processed chunks into the final dataset."""
+    chunks = sorted([
+        os.path.join(temp_path, d)
+        for d in os.listdir(temp_path)
+        if d.startswith("chunk_")
+    ])
+
+    datasets = [load_from_disk(chunk) for chunk in chunks]
+    combined = concatenate_datasets(datasets)
+    combined.save_to_disk(final_path)
+
+    print(f"✅ Dataset successfully saved to: {final_path}")
 
 # Create a temporary folder for the chunks
 temp_path = f"{save_path}_temp"
@@ -418,7 +392,7 @@ image_paths, text_paths, width, height = process_folder(folder_path, limit)
 print(f"Found {len(image_paths)} images in total")
 
 # Process with chunking
-process_in_chunks(image_paths, text_paths, width, height, chunk_size=10000, batch_size=batch_size)
+process_in_chunks(image_paths, text_paths, width, height, chunk_size=20000, batch_size=batch_size)
 
 # Merge chunks into the final dataset
 combine_chunks(temp_path, save_path)
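Design note: load_from_disk and concatenate_datasets memory-map the chunk Arrow files, which is why the tar/zstd round-trip and safe_repack could be dropped without blowing up RAM. If smaller output shards are still wanted, save_to_disk accepts max_shard_size (a possible variant, not what this commit does):

combined.save_to_disk(final_path, max_shard_size="500MB")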
 
samples/unet_320x640_0.jpg CHANGED

Git LFS Details (before)

  • SHA256: e414aa0a90d55a49d737a74608c8f33b57d58f03b0794b1bf6ef1da749258edb
  • Pointer size: 130 Bytes
  • Size of remote file: 55.2 kB

Git LFS Details (after)

  • SHA256: cbd28220db18f95d0fee2e027af5bc8829b9d3ee788c1d94b521d5c37c368983
  • Pointer size: 130 Bytes
  • Size of remote file: 62.4 kB
samples/unet_384x640_0.jpg CHANGED

Git LFS Details (before)

  • SHA256: 4542cf64e389bedc4a59eb55162569e6baf78b333f7b4eb12ede83e8314627f0
  • Pointer size: 131 Bytes
  • Size of remote file: 130 kB

Git LFS Details (after)

  • SHA256: c63700ae77198f9ffe16431218661d6703daaf085b5ebc5b9f5591b582d16bfd
  • Pointer size: 131 Bytes
  • Size of remote file: 159 kB
samples/unet_448x640_0.jpg CHANGED

Git LFS Details (before)

  • SHA256: b5336b5074ddf580d53550dca84bff8e8df33e9c41e14535ae254e1632d8b39c
  • Pointer size: 130 Bytes
  • Size of remote file: 63.1 kB

Git LFS Details (after)

  • SHA256: 465c5bf3e1b611dd046d0d28b1bc4850c2562fef837bf85816ae00959f28f78c
  • Pointer size: 130 Bytes
  • Size of remote file: 81.7 kB
samples/unet_512x640_0.jpg CHANGED

Git LFS Details (before)

  • SHA256: db3f1f8a85554109e263c5315d54db1bd644691f8cfada79008ec82f26eb71b3
  • Pointer size: 131 Bytes
  • Size of remote file: 123 kB

Git LFS Details (after)

  • SHA256: e94978b4e71b7b45166215ce68d794a95dcd5ad4f3ea39f31be1189110fd958f
  • Pointer size: 131 Bytes
  • Size of remote file: 131 kB
samples/unet_576x640_0.jpg CHANGED

Git LFS Details (before)

  • SHA256: 7cde37bc4f9e1e78c48bcc920d5b35e891a8c04166f5cde12c8956de1a0c98f1
  • Pointer size: 131 Bytes
  • Size of remote file: 222 kB

Git LFS Details (after)

  • SHA256: e3cd197d0f08c8da4e5e3c51258c4dae62ee498dfef2d876d9c0e93a7a2d8fc4
  • Pointer size: 131 Bytes
  • Size of remote file: 244 kB
samples/unet_640x320_0.jpg CHANGED

Git LFS Details (before)

  • SHA256: 2dfab00d9532e20fda124e6c03e17eb60bdd0f99411da359d0eac87a5d46ed64
  • Pointer size: 130 Bytes
  • Size of remote file: 73.5 kB

Git LFS Details (after)

  • SHA256: bbaebc70f65baa27de7ef43a6028c9bf11a73d7cf2874e402183edc590006ee8
  • Pointer size: 130 Bytes
  • Size of remote file: 88.3 kB
samples/unet_640x384_0.jpg CHANGED

Git LFS Details (before)

  • SHA256: d8858222a7168440d61651cb636460ccfa6160261a3c1ee02cdcfa64945ed84e
  • Pointer size: 131 Bytes
  • Size of remote file: 115 kB

Git LFS Details (after)

  • SHA256: 31315d6c63ecadd9ba9a9d568ae00047c2236a8d9cf87e9e357e477db0e427fc
  • Pointer size: 131 Bytes
  • Size of remote file: 110 kB
samples/unet_640x448_0.jpg CHANGED

Git LFS Details (before)

  • SHA256: 86ae9eca3fb4ede85edfeebea73dfa5ef37073d5a4ee98c1cd5f3b3fea6227fb
  • Pointer size: 130 Bytes
  • Size of remote file: 97.2 kB

Git LFS Details (after)

  • SHA256: b98d7ce8c61c9670407c5480614e695a8ccfee23c37fd08297edc6e8b4f59355
  • Pointer size: 130 Bytes
  • Size of remote file: 92.2 kB
samples/unet_640x512_0.jpg CHANGED

Git LFS Details (before)

  • SHA256: cb61b32bfc4874bd5ce6e8000b42aa857ae3cbbcdaadc15cb975af3de551ec55
  • Pointer size: 131 Bytes
  • Size of remote file: 106 kB

Git LFS Details (after)

  • SHA256: 5f29af11e61ea1773a84ab8cc9fa880bb60c1fdef4584be9157e9f1f8adb3d19
  • Pointer size: 131 Bytes
  • Size of remote file: 122 kB
samples/unet_640x576_0.jpg CHANGED

Git LFS Details (before)

  • SHA256: acfbd867ca33935affa45e0df87423585e4c491fe19aa18cd89d15853658b128
  • Pointer size: 131 Bytes
  • Size of remote file: 164 kB

Git LFS Details (after)

  • SHA256: e4263e6d85ffbbf5b0fd0cb3545ffb9fb9a081e73098e714e853f196a000845e
  • Pointer size: 131 Bytes
  • Size of remote file: 182 kB
samples/unet_640x640_0.jpg CHANGED

Git LFS Details (before)

  • SHA256: c10691ff09a6e6f55f71a46acba37b0ac1dadc66903b715e9b18b42ccc158ba4
  • Pointer size: 131 Bytes
  • Size of remote file: 157 kB

Git LFS Details (after)

  • SHA256: c839b3deba43cf54c33c903c7df431797f17c5734cf766ae3c0c32d84a3a6847
  • Pointer size: 131 Bytes
  • Size of remote file: 202 kB
src/untar.sh ADDED
@@ -0,0 +1,2 @@
+#!/bin/bash
+find . -maxdepth 1 -type f \( -name "*.tar*" -o -name "*.tgz" -o -name "*.tar.bz2" \) -exec sh -c 'tar -xf "$1" && rm "$1"' _ {} \;
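Usage note: run it from the directory holding the archives (e.g. cd /workspace && bash src/untar.sh). The -maxdepth 1 keeps find from descending into freshly extracted folders, and passing the filename as "$1" to sh -c (rather than splicing {} into the command string) keeps filenames with quotes or spaces safe.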
unet/diffusion_pytorch_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8820baa1b4aab525952d765f0162a0fcaf2a93641ffd8146683880c17b31c71e
+oid sha256:08b55f95640f0615bc957b7e0641973220578146f32d1647f900fa74c93f1f4d
 size 6205958296