2511
- dataset.py +50 -11
- dataset-Copy1.py → dataset_old.py +0 -0
- samples/unet_320x640_0.jpg +2 -2
- samples/unet_384x640_0.jpg +2 -2
- samples/unet_448x640_0.jpg +2 -2
- samples/unet_512x640_0.jpg +2 -2
- samples/unet_576x640_0.jpg +2 -2
- samples/unet_640x320_0.jpg +2 -2
- samples/unet_640x384_0.jpg +2 -2
- samples/unet_640x448_0.jpg +2 -2
- samples/unet_640x512_0.jpg +2 -2
- samples/unet_640x576_0.jpg +2 -2
- samples/unet_640x640_0.jpg +2 -2
- unet/diffusion_pytorch_model.safetensors +1 -1
dataset.py
CHANGED
@@ -22,15 +22,15 @@ import tempfile
 # ---------------- 1️⃣ Settings ----------------
 dtype = torch.float16
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-batch_size =
+batch_size = 10
 min_size = 320 #192 #256 #192
 max_size = 640 #384 #256 #384
 step = 64 #64
 empty_share = 0.05
 limit = 0
 # Main processing procedure
-folder_path = "/workspace/
-save_path = "/workspace/
+folder_path = "/workspace/tar" #alchemist"
+save_path = "/workspace/640" #"alchemist"
 dir_tmp = "/workspace/tmp"
 os.makedirs(save_path, exist_ok=True)
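For orientation, the settings above appear to drive the aspect-ratio buckets that the updated sample images are named after (samples/unet_320x640_0.jpg through samples/unet_640x640_0.jpg). Below is a minimal sketch of how min_size, max_size, and step could enumerate that grid, assuming one side is always pinned at max_size as the filenames suggest; the real bucketing code lives elsewhere in dataset.py and is not part of this diff.

# Hypothetical illustration, not code from this commit.
min_size, max_size, step = 320, 640, 64

buckets = [
    (w, h)
    for w in range(min_size, max_size + step, step)
    for h in range(min_size, max_size + step, step)
    if w == max_size or h == max_size  # one side pinned at max_size
]
print(len(buckets), buckets)
# 11 buckets: (320, 640), (384, 640), ..., (640, 576), (640, 640),
# matching the 11 sample images touched by this commit.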
@@ -355,19 +355,58 @@ def process_in_chunks(image_paths, text_paths, width, height, chunk_size=10000,
     print(f"ETA: Elapsed {elapsed_str}, Remaining {remaining_str}, Progress {processed}/{total_files} ({processed/total_files:.1%})")

 # ---------------- 7️⃣ Merging ----------------
+def safe_repack(ds):
+    """
+    Repack the dataset into small .arrow files.
+    WHAT IT DOES:
+    - reduces the Arrow chunk size
+    - writer_batch_size=1000 → Arrow files of 30–60 MB
+    """
+    return ds.map(lambda x: x, batched=True, writer_batch_size=1000)
+
+
 def combine_chunks(temp_path, final_path):
-    archives = sorted(
-
-
-
+    archives = sorted(
+        f for f in os.listdir(temp_path)
+        if f.endswith(".tar.zst")
+    )
+    archives = [os.path.join(temp_path, f) for f in archives]
+
+    print(f"Found {len(archives)} archives.")
+
+    # Initialize an empty dataset
+    merged_ds = None
+
+    for i, arc in enumerate(archives):
+        print(f"[{i+1}/{len(archives)}] Processing {arc}")
+
+        # Unpack
         tmp = tempfile.mkdtemp(dir=dir_tmp)
-        os.makedirs(tmp, exist_ok=True)
         subprocess.run(["tar", "-I", "zstd", "-xf", arc, "-C", tmp], check=True)
+
+        # Load the dataset
         ds = load_from_disk(tmp)
-
+
+        # Repack the chunk dataset to shrink its Arrow files
+        ds = safe_repack(ds)
+
+        # Merge
+        if merged_ds is None:
+            merged_ds = ds
+        else:
+            merged_ds = concatenate_datasets([merged_ds, ds])
+
+        # cleanup
         shutil.rmtree(tmp)
-
-
+        os.remove(arc)
+
+    # Final repack
+    print("⚙️ Final repack...")
+    merged_ds = safe_repack(merged_ds)
+
+    print("💾 Final save...")
+    merged_ds.save_to_disk(final_path)
+
     print(f"✅ Dataset saved: {final_path}")

 # Create a temporary folder for the chunks
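For context on the new safe_repack: it relies on the Hugging Face datasets behavior that an identity map() rewrites the dataset to disk, with writer_batch_size capping the rows per write batch and therefore the size of each resulting .arrow file. A standalone sketch, with an illustrative path that is not from this commit:

from datasets import load_from_disk

ds = load_from_disk("/workspace/tmp/chunk_00000")  # illustrative chunk directory
# Identity map: rows are unchanged, but the data is rewritten in batches
# of 1000 rows, yielding smaller .arrow cache files.
ds = ds.map(lambda x: x, batched=True, writer_batch_size=1000)
print(ds.cache_files)  # the repacked dataset now points at the new files

Whether this lands in the 30–60 MB range the docstring claims depends on the per-row payload, so that figure is an estimate for this dataset rather than a property of writer_batch_size itself.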
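The new combine_chunks assumes that load_from_disk and concatenate_datasets are imported from datasets at the top of the file (only the tempfile import is visible in the hunk header). A hypothetical driver call, with temp_path standing in for wherever the *.tar.zst chunk archives were written:

from datasets import load_from_disk, concatenate_datasets  # needed by combine_chunks

temp_path = "/workspace/tmp_chunks"  # hypothetical chunk-archive directory
combine_chunks(temp_path, save_path)  # save_path = "/workspace/640" in this commit

Note that the loop removes each archive (os.remove(arc)) as soon as it has been merged, so the chunks are consumed in place and an interrupted run cannot simply be restarted against the same directory.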
dataset-Copy1.py → dataset_old.py
RENAMED
File without changes
samples/unet_320x640_0.jpg
CHANGED
samples/unet_384x640_0.jpg
CHANGED
samples/unet_448x640_0.jpg
CHANGED
samples/unet_512x640_0.jpg
CHANGED
samples/unet_576x640_0.jpg
CHANGED
samples/unet_640x320_0.jpg
CHANGED
samples/unet_640x384_0.jpg
CHANGED
samples/unet_640x448_0.jpg
CHANGED
samples/unet_640x512_0.jpg
CHANGED
samples/unet_640x576_0.jpg
CHANGED
samples/unet_640x640_0.jpg
CHANGED
unet/diffusion_pytorch_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:a986a5cef456e178da40d574d94ade01ee5d9489d1540f318bcf9f696fc658d3
 size 6184944280
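The safetensors change only swaps the Git LFS pointer: the file tracked in the repo stores the blob's sha256 oid and byte size, while the weights themselves live in LFS storage. A small sketch, assuming a local checkout with the LFS object downloaded, for verifying the file against the new pointer:

import hashlib
import os

expected_oid = "a986a5cef456e178da40d574d94ade01ee5d9489d1540f318bcf9f696fc658d3"
expected_size = 6184944280
path = "unet/diffusion_pytorch_model.safetensors"  # relative to the repo root

assert os.path.getsize(path) == expected_size, "size mismatch"
h = hashlib.sha256()
with open(path, "rb") as f:
    for block in iter(lambda: f.read(1 << 20), b""):  # stream in 1 MiB blocks
        h.update(block)
assert h.hexdigest() == expected_oid, "sha256 mismatch"
print("local file matches the LFS pointer")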