recoilme committed on
Commit
d26cd38
·
1 Parent(s): 844bc36
dataset.py CHANGED
@@ -22,15 +22,15 @@ import tempfile
22
  # ---------------- 1️⃣ Settings ----------------
23
  dtype = torch.float16
24
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
25
- batch_size = 5
26
  min_size = 320 #192 #256 #192
27
  max_size = 640 #384 #256 #384
28
  step = 64 #64
29
  empty_share = 0.05  # NOTE(review): presumably the share of empty-caption samples — verify against usage
30
  limit = 0  # NOTE(review): presumably 0 means "no limit" — verify against usage
31
  # Main processing procedure
32
- folder_path = "/workspace/1" #alchemist"
33
- save_path = "/workspace/1_640" #"alchemist"
34
  dir_tmp = "/workspace/tmp"
35
  os.makedirs(save_path, exist_ok=True)
36
 
@@ -355,19 +355,58 @@ def process_in_chunks(image_paths, text_paths, width, height, chunk_size=10000,
355
  print(f"ETA: Прошло {elapsed_str}, Осталось {remaining_str}, Прогресс {processed}/{total_files} ({processed/total_files:.1%})")
356
 
357
  # ---------------- 7️⃣ Объединение ----------------
 
 
 
 
 
 
 
 
 
 
358
def combine_chunks(temp_path, final_path):
    """Merge every ``*.tar.zst`` chunk archive under *temp_path* into a
    single dataset saved at *final_path*.

    Each archive is extracted into a scratch directory under the global
    ``dir_tmp``, loaded with ``load_from_disk``, collected into a list,
    and the scratch directory is removed. All loaded parts are then
    concatenated once and written to *final_path*.
    """
    loaded = []
    for name in sorted(os.listdir(temp_path)):
        if not name.endswith(".tar.zst"):
            continue
        archive = os.path.join(temp_path, name)
        # original commented-out variant: tmp = tempfile.mkdtemp()
        scratch = tempfile.mkdtemp(dir=dir_tmp)
        os.makedirs(scratch, exist_ok=True)
        subprocess.run(["tar", "-I", "zstd", "-xf", archive, "-C", scratch], check=True)
        loaded.append(load_from_disk(scratch))
        # NOTE(review): scratch is removed while the loaded dataset may still
        # memory-map Arrow files inside it — works on Linux (inode survives
        # while mapped) but confirm this is intentional.
        shutil.rmtree(scratch)
    combined = concatenate_datasets(loaded)
    combined.save_to_disk(final_path)
    print(f"✅ Датасет сохранён: {final_path}")
 
373
  # Создаем временную папку для чанков
 
22
  # ---------------- 1️⃣ Settings ----------------
23
  dtype = torch.float16
24
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
25
+ batch_size = 10
26
  min_size = 320 #192 #256 #192
27
  max_size = 640 #384 #256 #384
28
  step = 64 #64
29
  empty_share = 0.05  # NOTE(review): presumably the share of empty-caption samples — verify against usage
30
  limit = 0  # NOTE(review): presumably 0 means "no limit" — verify against usage
31
  # Main processing procedure
32
+ folder_path = "/workspace/tar" #alchemist"
33
+ save_path = "/workspace/640" #"alchemist"
34
  dir_tmp = "/workspace/tmp"
35
  os.makedirs(save_path, exist_ok=True)
36
 
 
355
  print(f"ETA: Прошло {elapsed_str}, Осталось {remaining_str}, Прогресс {processed}/{total_files} ({processed/total_files:.1%})")
356
 
357
  # ---------------- 7️⃣ Объединение ----------------
358
def safe_repack(ds, writer_batch_size=1000):
    """Rewrite *ds* through an identity map to shrink its Arrow chunks.

    What it does:
      - passes every batch through unchanged (``lambda x: x``),
      - forces the writer to flush every *writer_batch_size* rows, which
        produces many small Arrow files (~30–60 MB at the default)
        instead of a few huge ones.

    Args:
        ds: the ``datasets.Dataset`` to repack.
        writer_batch_size: rows per written Arrow batch. Defaults to 1000,
            the value that was previously hard-coded.

    Returns:
        The repacked ``datasets.Dataset``.
    """
    return ds.map(lambda x: x, batched=True, writer_batch_size=writer_batch_size)
368
def combine_chunks(temp_path, final_path):
    """Merge all chunk archives in *temp_path* into one dataset at *final_path*.

    For each ``*.tar.zst`` archive (in sorted order):
      1. extract it into a scratch dir under the global ``dir_tmp``,
      2. load it with ``load_from_disk`` and repack it via ``safe_repack``,
      3. remove the scratch dir and delete the source archive
         (frees disk space as we go — the archives are consumed).

    The loaded parts are concatenated ONCE after the loop — the previous
    per-iteration ``concatenate_datasets([merged_ds, ds])`` was quadratic
    in the number of chunks. The merged result is repacked a final time
    and saved to *final_path*.

    Raises:
        ValueError: if *temp_path* contains no ``.tar.zst`` archives
            (previously this crashed later with an opaque AttributeError
            on ``safe_repack(None)``).
        subprocess.CalledProcessError: if extraction fails (``check=True``).
    """
    names = sorted(f for f in os.listdir(temp_path) if f.endswith(".tar.zst"))
    archives = [os.path.join(temp_path, f) for f in names]

    print(f"Найдено {len(archives)} архивов.")
    if not archives:
        raise ValueError(f"No .tar.zst archives found in {temp_path}")

    parts = []
    for i, arc in enumerate(archives):
        print(f"[{i+1}/{len(archives)}] Обрабатываю {arc}")

        # Extraction into a fresh scratch dir
        tmp = tempfile.mkdtemp(dir=dir_tmp)
        try:
            subprocess.run(["tar", "-I", "zstd", "-xf", arc, "-C", tmp], check=True)
            ds = load_from_disk(tmp)
            # Repack each chunk so the merged dataset is built from many
            # small Arrow files instead of a few huge ones.
            parts.append(safe_repack(ds))
        finally:
            # Always reclaim the scratch dir, even when extraction or
            # loading fails (the original leaked it on error).
            # NOTE(review): assumes safe_repack rewrote the data out of tmp
            # (map writes new cache files) — confirm before relying on it.
            shutil.rmtree(tmp)
        # The source archive is intentionally destroyed to free disk space.
        os.remove(arc)

    # Single concatenation instead of a per-iteration pairwise merge.
    merged_ds = concatenate_datasets(parts) if len(parts) > 1 else parts[0]

    print("⚙️ Финальная перепаковка...")
    merged_ds = safe_repack(merged_ds)

    print("💾 Финальное сохранение...")
    merged_ds.save_to_disk(final_path)
    print(f"✅ Датасет сохранён: {final_path}")
411
 
412
  # Создаем временную папку для чанков
dataset-Copy1.py → dataset_old.py RENAMED
File without changes
samples/unet_320x640_0.jpg CHANGED

Git LFS Details

  • SHA256: b8b2a6d57d614269e127795de56b63f26f0726a54a09b8bb344775de06bc0f76
  • Pointer size: 130 Bytes
  • Size of remote file: 58.4 kB

Git LFS Details

  • SHA256: 3f267b0ae5941e62b043cd4d72e92593fed3c7fee5310b9aea4fb3248a3860ac
  • Pointer size: 130 Bytes
  • Size of remote file: 58 kB
samples/unet_384x640_0.jpg CHANGED

Git LFS Details

  • SHA256: 7e2773f4f8051fb96c14308422909f2ef45517382fcf86f378ca319c52e94f6c
  • Pointer size: 131 Bytes
  • Size of remote file: 114 kB

Git LFS Details

  • SHA256: 7837ea426176d0e45a89e5244a67da6e1c735f650bc882ccb3b5b9d580b88b0a
  • Pointer size: 131 Bytes
  • Size of remote file: 124 kB
samples/unet_448x640_0.jpg CHANGED

Git LFS Details

  • SHA256: 3267e1d0eaf7de085a7827d0c6276a8d9eac4215d1a0bd4a58b9e898ab445423
  • Pointer size: 130 Bytes
  • Size of remote file: 84.3 kB

Git LFS Details

  • SHA256: f19698756eef51e631235f8ec6407115e08c7b56807bd914f682645d56afce4f
  • Pointer size: 130 Bytes
  • Size of remote file: 85.4 kB
samples/unet_512x640_0.jpg CHANGED

Git LFS Details

  • SHA256: 2adb69f6030bb412db145e5e6e1306ff2ac94fa42a42e5126425e43a3c34d25c
  • Pointer size: 131 Bytes
  • Size of remote file: 193 kB

Git LFS Details

  • SHA256: ad292a0fa0a0fb8c66cd27d2f4dc0c91bc13f5e8c6189dd32cdf20848e03b61f
  • Pointer size: 131 Bytes
  • Size of remote file: 157 kB
samples/unet_576x640_0.jpg CHANGED

Git LFS Details

  • SHA256: 1b0f3b1837131938a54a5d80c02e0ef9a4f759112b64460879e81de1e44c618a
  • Pointer size: 131 Bytes
  • Size of remote file: 138 kB

Git LFS Details

  • SHA256: 83f83e9ea1a73c1eddeb5fc2ed6d6291f73f219b085d69ffa2cfa691171cd372
  • Pointer size: 131 Bytes
  • Size of remote file: 153 kB
samples/unet_640x320_0.jpg CHANGED

Git LFS Details

  • SHA256: 5fbb49016141429ab770ec16bbbd5678e15a389f913ccd12c64be92efd1d8444
  • Pointer size: 130 Bytes
  • Size of remote file: 44.9 kB

Git LFS Details

  • SHA256: 61203c9f2bb864e35c7f6c0f9bf5430342775fae0cc91fcde485420b410c47fd
  • Pointer size: 130 Bytes
  • Size of remote file: 54.3 kB
samples/unet_640x384_0.jpg CHANGED

Git LFS Details

  • SHA256: e6bdf332cfe144b9025232ed17b93449fd9d544199f18ef59d52ff76eef2a19c
  • Pointer size: 130 Bytes
  • Size of remote file: 85 kB

Git LFS Details

  • SHA256: f38388b67e6ad0a8832900cb7a9c75e0ccd405f666e0a760e66ce2589ca3e4e3
  • Pointer size: 130 Bytes
  • Size of remote file: 99.1 kB
samples/unet_640x448_0.jpg CHANGED

Git LFS Details

  • SHA256: 55d2d0dc184988a72ebc9d5ac9a6aaeabde68a1f931c1c3fec9e2d3dcb4aae51
  • Pointer size: 131 Bytes
  • Size of remote file: 115 kB

Git LFS Details

  • SHA256: c3a01e116d95aabfadd1cb9f0deb0cd32bd724594a1e5ea0c17d3ae3423a63ab
  • Pointer size: 131 Bytes
  • Size of remote file: 116 kB
samples/unet_640x512_0.jpg CHANGED

Git LFS Details

  • SHA256: c087c28170c148c4633d7e47542fe7e1d9bc482fa921876867e0e331ee61e22a
  • Pointer size: 131 Bytes
  • Size of remote file: 252 kB

Git LFS Details

  • SHA256: d8068c3d3d092d1c2e07a05680c88eaa7b9560c65e8110b9a12c6e1e6c3690b0
  • Pointer size: 131 Bytes
  • Size of remote file: 228 kB
samples/unet_640x576_0.jpg CHANGED

Git LFS Details

  • SHA256: b155366845a9069bbb203106ee6ac7bce987f040635ad97d7687bd7c9a933ea7
  • Pointer size: 130 Bytes
  • Size of remote file: 71.7 kB

Git LFS Details

  • SHA256: fc8b3f504bdb564911bbec9023ae7e8094ba76d6a0965d99345d1104d96b3c03
  • Pointer size: 130 Bytes
  • Size of remote file: 83.6 kB
samples/unet_640x640_0.jpg CHANGED

Git LFS Details

  • SHA256: 222a0c74939078bb2b6324e50f893a087f313ebd64860d34fa73cf9293f692df
  • Pointer size: 131 Bytes
  • Size of remote file: 260 kB

Git LFS Details

  • SHA256: 3c81b2359d49108da655f33ee6ddca1e3b63b10ea2b952292d72e82655d1db07
  • Pointer size: 131 Bytes
  • Size of remote file: 255 kB
unet/diffusion_pytorch_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f7226baa7a6111c3ace4e7692400c546296e01273c4d946c1d13753454c2d359
3
  size 6184944280
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a986a5cef456e178da40d574d94ade01ee5d9489d1540f318bcf9f696fc658d3
3
  size 6184944280