2511
Browse files- dataset.py +57 -83
- samples/unet_320x640_0.jpg +2 -2
- samples/unet_384x640_0.jpg +2 -2
- samples/unet_448x640_0.jpg +2 -2
- samples/unet_512x640_0.jpg +2 -2
- samples/unet_576x640_0.jpg +2 -2
- samples/unet_640x320_0.jpg +2 -2
- samples/unet_640x384_0.jpg +2 -2
- samples/unet_640x448_0.jpg +2 -2
- samples/unet_640x512_0.jpg +2 -2
- samples/unet_640x576_0.jpg +2 -2
- samples/unet_640x640_0.jpg +2 -2
- src/untar.sh +2 -0
- unet/diffusion_pytorch_model.safetensors +1 -1
dataset.py
CHANGED
|
@@ -16,22 +16,19 @@ from typing import Dict, List, Tuple, Optional, Any
|
|
| 16 |
from PIL import Image
|
| 17 |
from tqdm import tqdm
|
| 18 |
from datetime import timedelta
|
| 19 |
-
import subprocess
|
| 20 |
-
import tempfile
|
| 21 |
|
| 22 |
# ---------------- 1️⃣ Настройки ----------------
|
| 23 |
dtype = torch.float16
|
| 24 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 25 |
batch_size = 10
|
| 26 |
-
min_size = 320 #192 #256 #192
|
| 27 |
-
max_size = 640 #384 #256 #384
|
| 28 |
-
step =
|
| 29 |
-
empty_share = 0.
|
| 30 |
limit = 0
|
| 31 |
# Основная процедура обработки
|
| 32 |
-
folder_path = "/workspace/
|
| 33 |
-
save_path = "/workspace/
|
| 34 |
-
dir_tmp = "/workspace/tmp"
|
| 35 |
os.makedirs(save_path, exist_ok=True)
|
| 36 |
|
| 37 |
# Функция для очистки CUDA памяти
|
|
@@ -45,18 +42,21 @@ def clear_cuda_memory():
|
|
| 45 |
# ---------------- 2️⃣ Загрузка моделей ----------------
|
| 46 |
def load_models():
|
| 47 |
print("Загрузка моделей...")
|
| 48 |
-
vae = AutoencoderKL.from_pretrained("AiArtLab/
|
| 49 |
-
|
| 50 |
-
model_name = "Qwen/Qwen3-0.6B"
|
| 51 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 52 |
-
model = AutoModelForCausalLM.from_pretrained(
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
).eval()
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
vae
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
shift_factor = getattr(vae.config, "shift_factor", 0.0)
|
| 62 |
if shift_factor is None:
|
|
@@ -136,7 +136,7 @@ def last_token_pool(last_hidden_states: torch.Tensor,
|
|
| 136 |
batch_size = last_hidden_states.shape[0]
|
| 137 |
return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
|
| 138 |
|
| 139 |
-
def encode_texts_batch(texts, tokenizer, model, device="cuda", max_length=150):
|
| 140 |
with torch.inference_mode():
|
| 141 |
# Токенизация
|
| 142 |
batch = tokenizer(
|
|
@@ -147,16 +147,32 @@ def encode_texts_batch(texts, tokenizer, model, device="cuda", max_length=150):
|
|
| 147 |
max_length=max_length
|
| 148 |
).to(device)
|
| 149 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
# Прогон через базовую модель (внутри CausalLM)
|
| 151 |
outputs = model.model(**batch, output_hidden_states=True)
|
| 152 |
|
| 153 |
# Берем последний слой (эмбеддинги всех токенов)
|
| 154 |
-
hidden_states = outputs.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
return hidden_states.cpu().numpy() # embeddings.unsqueeze(1).cpu().numpy()
|
| 157 |
|
| 158 |
def clean_label(label):
|
| 159 |
-
label = label.replace("Image 1", "").replace("Image 2", "").replace("Image 3", "").replace("Image 4", "")
|
|
|
|
|
|
|
| 160 |
return label
|
| 161 |
|
| 162 |
def process_labels_for_guidance(original_labels, prob_to_make_empty=0.01):
|
|
@@ -223,11 +239,11 @@ def encode_to_latents(images, texts):
|
|
| 223 |
text_labels = [clean_label(text) for text in texts]
|
| 224 |
|
| 225 |
model_prompts, text_labels = process_labels_for_guidance(text_labels, empty_share)
|
| 226 |
-
embeddings = encode_texts_batch(model_prompts, tokenizer, model)
|
| 227 |
|
| 228 |
return {
|
| 229 |
"vae": latents_np,
|
| 230 |
-
"embeddings": embeddings,
|
| 231 |
"text": text_labels,
|
| 232 |
"width": widths,
|
| 233 |
"height": heights
|
|
@@ -341,10 +357,6 @@ def process_in_chunks(image_paths, text_paths, width, height, chunk_size=10000,
|
|
| 341 |
# Сохраняем результаты группы
|
| 342 |
group_save_path = f"{save_path}_temp/chunk_{chunk_idx}_size_{size_key[0]}x{size_key[1]}"
|
| 343 |
processed_group.save_to_disk(group_save_path)
|
| 344 |
-
|
| 345 |
-
subprocess.run(["tar", "-I", "zstd", "-cf", f"{group_save_path}.tar.zst", "-C", group_save_path, "."], check=True)
|
| 346 |
-
shutil.rmtree(group_save_path)
|
| 347 |
-
|
| 348 |
clear_cuda_memory()
|
| 349 |
elapsed = time.time() - start_time
|
| 350 |
processed = (chunk_idx - 1) * chunk_size + sum([len(sg["image_paths"]) for sg in list(size_groups.values())[:list(size_groups.values()).index(group_data) + 1]])
|
|
@@ -354,60 +366,22 @@ def process_in_chunks(image_paths, text_paths, width, height, chunk_size=10000,
|
|
| 354 |
remaining_str = str(timedelta(seconds=int(remaining)))
|
| 355 |
print(f"ETA: Прошло {elapsed_str}, Осталось {remaining_str}, Прогресс {processed}/{total_files} ({processed/total_files:.1%})")
|
| 356 |
|
| 357 |
-
# ---------------- 7️⃣ Объединение ----------------
|
| 358 |
-
def safe_repack(ds):
|
| 359 |
-
"""
|
| 360 |
-
Перепаковка датасета в маленькие .arrow файлы.
|
| 361 |
-
ЧТО ДЕЛАЕТ:
|
| 362 |
-
- уменьшает Arrow chunk size
|
| 363 |
-
- writer_batch_size=1000 → Arrow 30–60 MB
|
| 364 |
-
"""
|
| 365 |
-
return ds.map(lambda x: x, batched=True, writer_batch_size=1000)
|
| 366 |
-
|
| 367 |
-
|
| 368 |
def combine_chunks(temp_path, final_path):
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
print(f"[{i+1}/{len(archives)}] Обрабатываю {arc}")
|
| 382 |
-
|
| 383 |
-
# Распаковка
|
| 384 |
-
tmp = tempfile.mkdtemp(dir=dir_tmp)
|
| 385 |
-
subprocess.run(["tar", "-I", "zstd", "-xf", arc, "-C", tmp], check=True)
|
| 386 |
-
|
| 387 |
-
# Загрузка датасета
|
| 388 |
-
ds = load_from_disk(tmp)
|
| 389 |
-
|
| 390 |
-
# Перепаковка чанк-датасета, чтобы уменьшить Arrow-файлы
|
| 391 |
-
ds = safe_repack(ds)
|
| 392 |
-
|
| 393 |
-
# Мерж
|
| 394 |
-
if merged_ds is None:
|
| 395 |
-
merged_ds = ds
|
| 396 |
-
else:
|
| 397 |
-
merged_ds = concatenate_datasets([merged_ds, ds])
|
| 398 |
-
|
| 399 |
-
# cleanup
|
| 400 |
-
shutil.rmtree(tmp)
|
| 401 |
-
os.remove(arc)
|
| 402 |
-
|
| 403 |
-
# Финальная перепаковка
|
| 404 |
-
print("⚙️ Финальная перепаковка...")
|
| 405 |
-
merged_ds = safe_repack(merged_ds)
|
| 406 |
-
|
| 407 |
-
print("💾 Финальное сохранение...")
|
| 408 |
-
merged_ds.save_to_disk(final_path)
|
| 409 |
|
| 410 |
-
|
| 411 |
|
| 412 |
# Создаем временную папку для чанков
|
| 413 |
temp_path = f"{save_path}_temp"
|
|
@@ -418,7 +392,7 @@ image_paths, text_paths, width, height = process_folder(folder_path,limit)
|
|
| 418 |
print(f"Всего найдено {len(image_paths)} изображений")
|
| 419 |
|
| 420 |
# Обработка с чанкованием
|
| 421 |
-
process_in_chunks(image_paths, text_paths, width, height, chunk_size=
|
| 422 |
|
| 423 |
# Объединение чанков в финальный датасет
|
| 424 |
combine_chunks(temp_path, save_path)
|
|
|
|
| 16 |
from PIL import Image
|
| 17 |
from tqdm import tqdm
|
| 18 |
from datetime import timedelta
|
|
|
|
|
|
|
| 19 |
|
| 20 |
# ---------------- 1️⃣ Настройки ----------------
|
| 21 |
dtype = torch.float16
|
| 22 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 23 |
batch_size = 10
|
| 24 |
+
min_size = 384 #320 #192 #256 #192
|
| 25 |
+
max_size = 768 #640 #384 #256 #384
|
| 26 |
+
step = 32 #64
|
| 27 |
+
empty_share = 0.0
|
| 28 |
limit = 0
|
| 29 |
# Основная процедура обработки
|
| 30 |
+
folder_path = "/workspace/dataset/dataset/ae3" #alchemist"
|
| 31 |
+
save_path = "/workspace/ae3_768" #"alchemist"
|
|
|
|
| 32 |
os.makedirs(save_path, exist_ok=True)
|
| 33 |
|
| 34 |
# Функция для очистки CUDA памяти
|
|
|
|
| 42 |
# ---------------- 2️⃣ Загрузка моделей ----------------
|
| 43 |
def load_models():
    """Load the pretrained VAE and return it ready for inference.

    The checkpoint is ``AiArtLab/sdxs`` (subfolder ``vae1x``). The module-level
    ``dtype`` and ``device`` settings decide its precision and placement.

    Returns:
        The VAE only, moved to ``device`` and switched to eval mode. No text
        encoder or tokenizer is returned.
    """
    print("Загрузка моделей...")
    autoencoder = AutoencoderKL.from_pretrained(
        "AiArtLab/sdxs",
        subfolder="vae1x",
        torch_dtype=dtype,
    )
    autoencoder = autoencoder.to(device)
    # Text-encoder loading is switched off in this version. The disabled
    # variants were Qwen/Qwen3-0.6B through AutoModelForCausalLM and
    # Qwen/Qwen3-Embedding-0.6B through AutoModel (left-padded tokenizer).
    return autoencoder.eval()
|
| 57 |
+
|
| 58 |
+
#vae, model, tokenizer = load_models()
|
| 59 |
+
vae = load_models()
|
| 60 |
|
| 61 |
shift_factor = getattr(vae.config, "shift_factor", 0.0)
|
| 62 |
if shift_factor is None:
|
|
|
|
| 136 |
batch_size = last_hidden_states.shape[0]
|
| 137 |
return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
|
| 138 |
|
| 139 |
+
def encode_texts_batch(texts, tokenizer, model, device="cuda", max_length=150, normalize=False):
|
| 140 |
with torch.inference_mode():
|
| 141 |
# Токенизация
|
| 142 |
batch = tokenizer(
|
|
|
|
| 147 |
max_length=max_length
|
| 148 |
).to(device)
|
| 149 |
|
| 150 |
+
# Прогон через модель
|
| 151 |
+
#outputs = model(**batch)
|
| 152 |
+
|
| 153 |
+
# Пулинг по last token
|
| 154 |
+
#embeddings = last_token_pool(outputs.last_hidden_state, batch["attention_mask"])
|
| 155 |
+
|
| 156 |
+
# L2-нормализация (опционально, обычно нужна для семантического поиска)
|
| 157 |
+
#if normalize:
|
| 158 |
+
# embeddings = F.normalize(embeddings, p=2, dim=1)
|
| 159 |
+
|
| 160 |
# Прогон через базовую модель (внутри CausalLM)
|
| 161 |
outputs = model.model(**batch, output_hidden_states=True)
|
| 162 |
|
| 163 |
# Берем последний слой (эмбеддинги всех токенов)
|
| 164 |
+
hidden_states = outputs.hidden_states[-1] # [B, L, D]
|
| 165 |
+
|
| 166 |
+
# Можно применить нормализацию по каждому токену (как в CLIP)
|
| 167 |
+
if normalize:
|
| 168 |
+
hidden_states = F.normalize(hidden_states, p=2, dim=-1)
|
| 169 |
|
| 170 |
return hidden_states.cpu().numpy() # embeddings.unsqueeze(1).cpu().numpy()
|
| 171 |
|
| 172 |
def clean_label(label):
    """Strip caption boilerplate from a text label.

    Every occurrence of the markers ``Image 1`` .. ``Image 4`` and of the
    lead-in phrases listed below is removed, wherever it appears in the text.
    Surrounding whitespace is then trimmed, and a single leading ``.`` that
    the removal may have exposed is dropped.

    Args:
        label: Raw caption text.

    Returns:
        The cleaned caption.
    """
    # Fragments are removed in this exact order, matching the original chain
    # of ``str.replace`` calls. The trailing space in each lead-in phrase is
    # part of the match.
    removable_fragments = (
        "Image 1",
        "Image 2",
        "Image 3",
        "Image 4",
        "The image depicts ",
        "The image presents ",
        "The image features ",
        "The image portrays ",
        "The image is ",
    )
    for fragment in removable_fragments:
        label = label.replace(fragment, "")
    label = label.strip()

    # Removing a marker such as "Image 2" from "Image 2. A dog" leaves a
    # dangling period at the start; drop it together with the gap after it.
    if label.startswith("."):
        label = label[1:].lstrip()
    return label
|
| 177 |
|
| 178 |
def process_labels_for_guidance(original_labels, prob_to_make_empty=0.01):
|
|
|
|
| 239 |
text_labels = [clean_label(text) for text in texts]
|
| 240 |
|
| 241 |
model_prompts, text_labels = process_labels_for_guidance(text_labels, empty_share)
|
| 242 |
+
#embeddings = encode_texts_batch(model_prompts, tokenizer, model)
|
| 243 |
|
| 244 |
return {
|
| 245 |
"vae": latents_np,
|
| 246 |
+
#"embeddings": embeddings,
|
| 247 |
"text": text_labels,
|
| 248 |
"width": widths,
|
| 249 |
"height": heights
|
|
|
|
| 357 |
# Сохраняем результаты группы
|
| 358 |
group_save_path = f"{save_path}_temp/chunk_{chunk_idx}_size_{size_key[0]}x{size_key[1]}"
|
| 359 |
processed_group.save_to_disk(group_save_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 360 |
clear_cuda_memory()
|
| 361 |
elapsed = time.time() - start_time
|
| 362 |
processed = (chunk_idx - 1) * chunk_size + sum([len(sg["image_paths"]) for sg in list(size_groups.values())[:list(size_groups.values()).index(group_data) + 1]])
|
|
|
|
| 366 |
remaining_str = str(timedelta(seconds=int(remaining)))
|
| 367 |
print(f"ETA: Прошло {elapsed_str}, Осталось {remaining_str}, Прогресс {processed}/{total_files} ({processed/total_files:.1%})")
|
| 368 |
|
| 369 |
+
# ---------------- 7️⃣ Объединение чанков ----------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
def combine_chunks(temp_path, final_path):
    """Merge the processed chunks under *temp_path* into one final dataset.

    Args:
        temp_path: Directory containing the per-chunk datasets, each saved as
            a ``chunk_*`` sub-directory.
        final_path: Destination handed to ``save_to_disk`` for the merged
            dataset.

    Raises:
        ValueError: If *temp_path* holds no ``chunk_*`` directory.
    """
    chunk_dirs = []
    for entry in sorted(os.listdir(temp_path)):
        candidate = os.path.join(temp_path, entry)
        # Only directories are saved datasets. Stray files that share the
        # prefix (e.g. leftover ``chunk_*.tar.zst`` archives) must be skipped,
        # otherwise load_from_disk fails on them.
        if entry.startswith("chunk_") and os.path.isdir(candidate):
            chunk_dirs.append(candidate)

    # Fail early with an explicit message instead of passing an empty list on
    # to concatenate_datasets.
    if not chunk_dirs:
        raise ValueError(f"No chunk_* directories found in {temp_path!r}")

    combined = concatenate_datasets([load_from_disk(path) for path in chunk_dirs])
    combined.save_to_disk(final_path)

    print(f"✅ Датасет успешно сохранен в: {final_path}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
|
| 384 |
+
|
| 385 |
|
| 386 |
# Создаем временную папку для чанков
|
| 387 |
temp_path = f"{save_path}_temp"
|
|
|
|
| 392 |
print(f"Всего найдено {len(image_paths)} изображений")
|
| 393 |
|
| 394 |
# Обработка с чанкованием
|
| 395 |
+
process_in_chunks(image_paths, text_paths, width, height, chunk_size=20000, batch_size=batch_size)
|
| 396 |
|
| 397 |
# Объединение чанков в финальный датасет
|
| 398 |
combine_chunks(temp_path, save_path)
|
samples/unet_320x640_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_384x640_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_448x640_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_512x640_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_576x640_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_640x320_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_640x384_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_640x448_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_640x512_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_640x576_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_640x640_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
src/untar.sh
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Unpack every tar archive that sits directly in the current directory and
# delete each archive only after tar has extracted it successfully.
#
# The archive path is handed to the inner shell as the positional argument $1
# rather than spliced into the command string. Embedding {} inside the
# `sh -c` text breaks (or executes unintended code) on file names containing
# quotes, `$` or backticks, and its expansion there is implementation-defined.
find . -maxdepth 1 -type f \( -name "*.tar*" -o -name "*.tgz" -o -name "*.tar.bz2" \) -exec sh -c 'tar -xf "$1" && rm "$1"' _ {} \;
|
unet/diffusion_pytorch_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 6205958296
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:08b55f95640f0615bc957b7e0641973220578146f32d1647f900fa74c93f1f4d
|
| 3 |
size 6205958296
|