|
|
|
|
|
import sys |
|
|
import hashlib |
|
|
import shutil |
|
|
from pathlib import Path |
|
|
from wsgiref.validate import assert_ |
|
|
|
|
|
from ultralytics.data.converter import convert_coco |
|
|
import os |
|
|
|
|
|
|
|
|
# Source directory; scanned recursively for *.jpg images by the script below.
INPUT_DIR = "data/coco"

# Directory handed to convert_coco as save_dir; removed at the start of each run.
YOLO_DATA_DIR = "data/yolo"

# Root of the train/val split (images/ and labels/ subfolders); removed at the
# start of each run.
TARGET_DIR = "data/yolo_split"
|
|
|
|
|
|
|
|
|
|
|
def md5_of_file(path, chunk=1 << 20):
    """Return the hexadecimal MD5 digest of the file at *path*.

    The file is opened in binary mode and consumed in reads of *chunk*
    bytes (1 MiB by default), so the whole file is never held in memory
    at once.
    """
    digest = hashlib.md5()
    with open(path, "rb") as stream:
        # iter() with a sentinel keeps calling read() until it returns b"",
        # which is how a binary stream signals end-of-file.
        for piece in iter(lambda: stream.read(chunk), b""):
            digest.update(piece)
    return digest.hexdigest()
|
|
|
|
|
|
|
|
def _rewrite_label_file(src, dst):
    """Copy the label file *src* to *dst*, rewriting class id "1" as "0".

    Blank lines are dropped, the whitespace-separated fields of every other
    line are re-joined with single spaces, and the lines are written joined
    by "\\n" without a trailing newline.
    """
    rows = []
    with open(src, "r", encoding="utf-8") as label_in:
        for raw in label_in:
            fields = raw.split()
            if not fields:
                continue
            # NOTE(review): presumably merges class 1 into class 0 to get a
            # single-class dataset -- confirm this is the intended remapping.
            if fields[0] == "1":
                fields[0] = "0"
            rows.append(" ".join(fields))

    with open(dst, "w", encoding="utf-8") as label_out:
        label_out.write("\n".join(rows))


def main():
    """Convert the COCO data to YOLO labels and split it into train/val.

    Steps:
      1. Delete YOLO_DATA_DIR and TARGET_DIR if they exist.
      2. Run ``convert_coco`` on INPUT_DIR, saving into YOLO_DATA_DIR.
      3. For every ``*.jpg`` under INPUT_DIR that has a matching label file
         in ``YOLO_DATA_DIR/labels/default``, copy the image and a rewritten
         label file into the ``val`` split (hash bucket < 20 of 100) or the
         ``train`` split (everything else) below TARGET_DIR.

    The split is derived from the MD5 of the image content, so a given image
    always lands in the same split regardless of run or file order.
    """
    val_percent = 20  # buckets 0..19 out of 100 go to the validation split

    # Remove the output of previous runs so stale files are not mixed in.
    for stale_dir in (YOLO_DATA_DIR, TARGET_DIR):
        if os.path.isdir(stale_dir):
            shutil.rmtree(stale_dir)

    # Use the INPUT_DIR constant (not a repeated literal) so the converter and
    # the image scan below always read the same directory.
    convert_coco(labels_dir=INPUT_DIR, use_segments=True, save_dir=YOLO_DATA_DIR)

    # split name -> (image directory, label directory)
    split_dirs = {
        split: (
            os.path.join(TARGET_DIR, "images", split),
            os.path.join(TARGET_DIR, "labels", split),
        )
        for split in ("train", "val")
    }
    for image_dir, annotation_dir in split_dirs.values():
        os.makedirs(image_dir, exist_ok=True)
        os.makedirs(annotation_dir, exist_ok=True)

    images = list(Path(INPUT_DIR).rglob("*.jpg"))
    print(f"Found {len(images)} .jpg files")

    counts = {"train": 0, "val": 0}
    label_dir = os.path.join(YOLO_DATA_DIR, "labels", "default")

    for image in images:
        label_basename = image.stem + ".txt"
        label_file = os.path.join(label_dir, label_basename)
        # Check for the label first: images without one are skipped, so there
        # is no point in hashing them.
        if not os.path.exists(label_file):
            continue

        bucket = int(md5_of_file(image), 16) % 100
        split = "val" if bucket < val_percent else "train"
        image_dir, annotation_dir = split_dirs[split]

        shutil.copy2(image, os.path.join(image_dir, image.name))
        _rewrite_label_file(
            label_file, os.path.join(annotation_dir, label_basename)
        )
        counts[split] += 1

    print(f"Copied {counts['train']} files to train/, {counts['val']} files to val/")


if __name__ == "__main__":
    main()
|
|
|