recoilme committed on
Commit
d26cd38
·
1 Parent(s): 844bc36
dataset.py CHANGED
@@ -22,15 +22,15 @@ import tempfile
22
  # ---------------- 1️⃣ Settings ----------------
23
  dtype = torch.float16
24
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
25
- batch_size = 5
26
  min_size = 320 #192 #256 #192
27
  max_size = 640 #384 #256 #384
28
  step = 64 #64
29
  empty_share = 0.05  # NOTE(review): presumably the share of empty-caption samples — verify against usage
30
  limit = 0  # NOTE(review): presumably 0 means "no limit" — verify against usage
31
  # Main processing procedure
32
- folder_path = "/workspace/1" #alchemist"
33
- save_path = "/workspace/1_640" #"alchemist"
34
  dir_tmp = "/workspace/tmp"
35
  os.makedirs(save_path, exist_ok=True)
36
 
@@ -355,19 +355,58 @@ def process_in_chunks(image_paths, text_paths, width, height, chunk_size=10000,
355
  print(f"ETA: Прошло {elapsed_str}, Осталось {remaining_str}, Прогресс {processed}/{total_files} ({processed/total_files:.1%})")
356
 
357
  # ---------------- 7️⃣ Объединение ----------------
 
 
 
 
 
 
 
 
 
 
358
def combine_chunks(temp_path, final_path):
    """Merge every ``*.tar.zst`` chunk archive under *temp_path* into a
    single dataset saved at *final_path*.

    Each archive is extracted into a scratch directory under the global
    ``dir_tmp``, loaded with ``load_from_disk``, collected into a list,
    and the scratch directory is removed. All loaded parts are then
    concatenated once and written to *final_path*.
    """
    loaded = []
    for name in sorted(os.listdir(temp_path)):
        if not name.endswith(".tar.zst"):
            continue
        archive = os.path.join(temp_path, name)
        # original commented-out variant: tmp = tempfile.mkdtemp()
        scratch = tempfile.mkdtemp(dir=dir_tmp)
        os.makedirs(scratch, exist_ok=True)
        subprocess.run(["tar", "-I", "zstd", "-xf", archive, "-C", scratch], check=True)
        loaded.append(load_from_disk(scratch))
        # NOTE(review): scratch is removed while the loaded dataset may still
        # memory-map Arrow files inside it — works on Linux (inode survives
        # while mapped) but confirm this is intentional.
        shutil.rmtree(scratch)
    combined = concatenate_datasets(loaded)
    combined.save_to_disk(final_path)
    print(f"✅ Датасет сохранён: {final_path}")
 
373
  # Создаем временную папку для чанков
 
22
  # ---------------- 1️⃣ Settings ----------------
23
  dtype = torch.float16
24
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
25
+ batch_size = 10
26
  min_size = 320 #192 #256 #192
27
  max_size = 640 #384 #256 #384
28
  step = 64 #64
29
  empty_share = 0.05  # NOTE(review): presumably the share of empty-caption samples — verify against usage
30
  limit = 0  # NOTE(review): presumably 0 means "no limit" — verify against usage
31
  # Main processing procedure
32
+ folder_path = "/workspace/tar" #alchemist"
33
+ save_path = "/workspace/640" #"alchemist"
34
  dir_tmp = "/workspace/tmp"
35
  os.makedirs(save_path, exist_ok=True)
36
 
 
355
  print(f"ETA: Прошло {elapsed_str}, Осталось {remaining_str}, Прогресс {processed}/{total_files} ({processed/total_files:.1%})")
356
 
357
  # ---------------- 7️⃣ Объединение ----------------
358
def safe_repack(ds, writer_batch_size=1000):
    """Rewrite *ds* through an identity map to shrink its Arrow chunks.

    What it does:
      - passes every batch through unchanged (``lambda x: x``),
      - forces the writer to flush every *writer_batch_size* rows, which
        produces many small Arrow files (~30–60 MB at the default)
        instead of a few huge ones.

    Args:
        ds: the ``datasets.Dataset`` to repack.
        writer_batch_size: rows per written Arrow batch. Defaults to 1000,
            the value that was previously hard-coded.

    Returns:
        The repacked ``datasets.Dataset``.
    """
    return ds.map(lambda x: x, batched=True, writer_batch_size=writer_batch_size)
368
def combine_chunks(temp_path, final_path):
    """Merge all chunk archives in *temp_path* into one dataset at *final_path*.

    For each ``*.tar.zst`` archive (in sorted order):
      1. extract it into a scratch dir under the global ``dir_tmp``,
      2. load it with ``load_from_disk`` and repack it via ``safe_repack``,
      3. remove the scratch dir and delete the source archive
         (frees disk space as we go — the archives are consumed).

    The loaded parts are concatenated ONCE after the loop — the previous
    per-iteration ``concatenate_datasets([merged_ds, ds])`` was quadratic
    in the number of chunks. The merged result is repacked a final time
    and saved to *final_path*.

    Raises:
        ValueError: if *temp_path* contains no ``.tar.zst`` archives
            (previously this crashed later with an opaque AttributeError
            on ``safe_repack(None)``).
        subprocess.CalledProcessError: if extraction fails (``check=True``).
    """
    names = sorted(f for f in os.listdir(temp_path) if f.endswith(".tar.zst"))
    archives = [os.path.join(temp_path, f) for f in names]

    print(f"Найдено {len(archives)} архивов.")
    if not archives:
        raise ValueError(f"No .tar.zst archives found in {temp_path}")

    parts = []
    for i, arc in enumerate(archives):
        print(f"[{i+1}/{len(archives)}] Обрабатываю {arc}")

        # Extraction into a fresh scratch dir
        tmp = tempfile.mkdtemp(dir=dir_tmp)
        try:
            subprocess.run(["tar", "-I", "zstd", "-xf", arc, "-C", tmp], check=True)
            ds = load_from_disk(tmp)
            # Repack each chunk so the merged dataset is built from many
            # small Arrow files instead of a few huge ones.
            parts.append(safe_repack(ds))
        finally:
            # Always reclaim the scratch dir, even when extraction or
            # loading fails (the original leaked it on error).
            # NOTE(review): assumes safe_repack rewrote the data out of tmp
            # (map writes new cache files) — confirm before relying on it.
            shutil.rmtree(tmp)
        # The source archive is intentionally destroyed to free disk space.
        os.remove(arc)

    # Single concatenation instead of a per-iteration pairwise merge.
    merged_ds = concatenate_datasets(parts) if len(parts) > 1 else parts[0]

    print("⚙️ Финальная перепаковка...")
    merged_ds = safe_repack(merged_ds)

    print("💾 Финальное сохранение...")
    merged_ds.save_to_disk(final_path)
    print(f"✅ Датасет сохранён: {final_path}")
411
 
412
  # Создаем временную папку для чанков
dataset-Copy1.py → dataset_old.py RENAMED
File without changes
samples/unet_320x640_0.jpg CHANGED

Git LFS Details

  • SHA256: b8b2a6d57d614269e127795de56b63f26f0726a54a09b8bb344775de06bc0f76
  • Pointer size: 130 Bytes
  • Size of remote file: 58.4 kB

Git LFS Details

  • SHA256: 3f267b0ae5941e62b043cd4d72e92593fed3c7fee5310b9aea4fb3248a3860ac
  • Pointer size: 130 Bytes
  • Size of remote file: 58 kB
samples/unet_384x640_0.jpg CHANGED

Git LFS Details

  • SHA256: 7e2773f4f8051fb96c14308422909f2ef45517382fcf86f378ca319c52e94f6c
  • Pointer size: 131 Bytes
  • Size of remote file: 114 kB

Git LFS Details

  • SHA256: 7837ea426176d0e45a89e5244a67da6e1c735f650bc882ccb3b5b9d580b88b0a
  • Pointer size: 131 Bytes
  • Size of remote file: 124 kB
samples/unet_448x640_0.jpg CHANGED

Git LFS Details

  • SHA256: 3267e1d0eaf7de085a7827d0c6276a8d9eac4215d1a0bd4a58b9e898ab445423
  • Pointer size: 130 Bytes
  • Size of remote file: 84.3 kB

Git LFS Details

  • SHA256: f19698756eef51e631235f8ec6407115e08c7b56807bd914f682645d56afce4f
  • Pointer size: 130 Bytes
  • Size of remote file: 85.4 kB
samples/unet_512x640_0.jpg CHANGED

Git LFS Details

  • SHA256: 2adb69f6030bb412db145e5e6e1306ff2ac94fa42a42e5126425e43a3c34d25c
  • Pointer size: 131 Bytes
  • Size of remote file: 193 kB

Git LFS Details

  • SHA256: ad292a0fa0a0fb8c66cd27d2f4dc0c91bc13f5e8c6189dd32cdf20848e03b61f
  • Pointer size: 131 Bytes
  • Size of remote file: 157 kB
samples/unet_576x640_0.jpg CHANGED

Git LFS Details

  • SHA256: 1b0f3b1837131938a54a5d80c02e0ef9a4f759112b64460879e81de1e44c618a
  • Pointer size: 131 Bytes
  • Size of remote file: 138 kB

Git LFS Details

  • SHA256: 83f83e9ea1a73c1eddeb5fc2ed6d6291f73f219b085d69ffa2cfa691171cd372
  • Pointer size: 131 Bytes
  • Size of remote file: 153 kB
samples/unet_640x320_0.jpg CHANGED

Git LFS Details

  • SHA256: 5fbb49016141429ab770ec16bbbd5678e15a389f913ccd12c64be92efd1d8444
  • Pointer size: 130 Bytes
  • Size of remote file: 44.9 kB

Git LFS Details

  • SHA256: 61203c9f2bb864e35c7f6c0f9bf5430342775fae0cc91fcde485420b410c47fd
  • Pointer size: 130 Bytes
  • Size of remote file: 54.3 kB
samples/unet_640x384_0.jpg CHANGED

Git LFS Details

  • SHA256: e6bdf332cfe144b9025232ed17b93449fd9d544199f18ef59d52ff76eef2a19c
  • Pointer size: 130 Bytes
  • Size of remote file: 85 kB

Git LFS Details

  • SHA256: f38388b67e6ad0a8832900cb7a9c75e0ccd405f666e0a760e66ce2589ca3e4e3
  • Pointer size: 130 Bytes
  • Size of remote file: 99.1 kB
samples/unet_640x448_0.jpg CHANGED

Git LFS Details

  • SHA256: 55d2d0dc184988a72ebc9d5ac9a6aaeabde68a1f931c1c3fec9e2d3dcb4aae51
  • Pointer size: 131 Bytes
  • Size of remote file: 115 kB

Git LFS Details

  • SHA256: c3a01e116d95aabfadd1cb9f0deb0cd32bd724594a1e5ea0c17d3ae3423a63ab
  • Pointer size: 131 Bytes
  • Size of remote file: 116 kB
samples/unet_640x512_0.jpg CHANGED

Git LFS Details

  • SHA256: c087c28170c148c4633d7e47542fe7e1d9bc482fa921876867e0e331ee61e22a
  • Pointer size: 131 Bytes
  • Size of remote file: 252 kB

Git LFS Details

  • SHA256: d8068c3d3d092d1c2e07a05680c88eaa7b9560c65e8110b9a12c6e1e6c3690b0
  • Pointer size: 131 Bytes
  • Size of remote file: 228 kB
samples/unet_640x576_0.jpg CHANGED

Git LFS Details

  • SHA256: b155366845a9069bbb203106ee6ac7bce987f040635ad97d7687bd7c9a933ea7
  • Pointer size: 130 Bytes
  • Size of remote file: 71.7 kB

Git LFS Details

  • SHA256: fc8b3f504bdb564911bbec9023ae7e8094ba76d6a0965d99345d1104d96b3c03
  • Pointer size: 130 Bytes
  • Size of remote file: 83.6 kB
samples/unet_640x640_0.jpg CHANGED

Git LFS Details

  • SHA256: 222a0c74939078bb2b6324e50f893a087f313ebd64860d34fa73cf9293f692df
  • Pointer size: 131 Bytes
  • Size of remote file: 260 kB

Git LFS Details

  • SHA256: 3c81b2359d49108da655f33ee6ddca1e3b63b10ea2b952292d72e82655d1db07
  • Pointer size: 131 Bytes
  • Size of remote file: 255 kB
unet/diffusion_pytorch_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f7226baa7a6111c3ace4e7692400c546296e01273c4d946c1d13753454c2d359
3
  size 6184944280
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a986a5cef456e178da40d574d94ade01ee5d9489d1540f318bcf9f696fc658d3
3
  size 6184944280