"""Convert a COCO dataset to YOLO format and split it into train/val sets.

Running this file as a script:

1. deletes any previous ``YOLO_DATA_DIR`` and ``TARGET_DIR`` output,
2. converts the COCO annotations found in ``INPUT_DIR`` into YOLO label
   files under ``YOLO_DATA_DIR``,
3. copies every ``.jpg`` below ``INPUT_DIR`` that has a converted label file
   into ``TARGET_DIR/images/{train,val}`` and writes its label file (with
   class id ``1`` rewritten to ``0``) into ``TARGET_DIR/labels/{train,val}``.

Which split an image lands in is derived from the MD5 digest of its bytes,
so re-running the script on the same images reproduces the same split.
"""

# NOTE(review): `sys` and `assert_` look unused in this file; they are kept
# in case something else relies on them being importable from here.
import sys
import hashlib
import shutil
from pathlib import Path
from wsgiref.validate import assert_
import os

INPUT_DIR = "data/coco"
YOLO_DATA_DIR = "data/yolo"
TARGET_DIR = "data/yolo_split"

# An image goes to the validation split when its hash bucket (0-99) is below
# this value; every other image goes to the training split.
VAL_PERCENT = 20
# Sub-directory of <YOLO_DATA_DIR>/labels that the converted label files are
# read from (presumably named after the COCO annotation file - confirm).
LABEL_SUBDIR = "default"


def md5_of_file(path, chunk=1 << 20):
    """Return the hexadecimal MD5 digest of the file at *path*.

    The file is read in pieces of at most *chunk* bytes, so it never has to
    be held in memory as a whole.
    """
    digest = hashlib.md5()
    with open(path, "rb") as fh:
        # iter(callable, sentinel) keeps reading until read() returns b"".
        for piece in iter(lambda: fh.read(chunk), b""):
            digest.update(piece)
    return digest.hexdigest()


def _is_val_image(image_path):
    """Return True when the image at *image_path* belongs to the val split."""
    bucket = int(md5_of_file(image_path), 16) % 100
    return bucket < VAL_PERCENT


def _write_remapped_labels(src, dst):
    """Copy label file *src* to *dst*, rewriting class id ``1`` to ``0``.

    Blank lines are dropped, the fields of each remaining line are re-joined
    with single spaces, and lines are separated by ``\\n`` with no trailing
    newline after the last one.
    """
    rows = []
    with open(src, "r", encoding="utf-8") as fh:
        for raw in fh:
            parts = raw.split()
            if not parts:
                continue
            # The first field is the class id; id 1 is folded into id 0 and
            # every other id is written back unchanged.
            if parts[0] == "1":
                parts[0] = "0"
            rows.append(" ".join(parts))
    with open(dst, "w", encoding="utf-8") as fh:
        fh.write("\n".join(rows))


def main():
    """Run the COCO -> YOLO conversion and the train/val split."""
    # Imported here, before anything is deleted, so a missing dependency
    # still aborts the run up front while the helpers above stay importable
    # without it.
    from ultralytics.data.converter import convert_coco

    # Remove the output of earlier runs so nothing stale survives.
    for stale_dir in (YOLO_DATA_DIR, TARGET_DIR):
        if os.path.isdir(stale_dir):
            shutil.rmtree(stale_dir)

    # INPUT_DIR is used for both the annotations and the image search below,
    # so the two can no longer drift apart.
    convert_coco(labels_dir=INPUT_DIR, use_segments=True, save_dir=YOLO_DATA_DIR)

    # split name -> (image directory, label directory)
    split_dirs = {}
    for split in ("train", "val"):
        image_dir = os.path.join(TARGET_DIR, "images", split)
        annotation_dir = os.path.join(TARGET_DIR, "labels", split)
        os.makedirs(image_dir, exist_ok=True)
        os.makedirs(annotation_dir, exist_ok=True)
        split_dirs[split] = (image_dir, annotation_dir)

    images = list(Path(INPUT_DIR).rglob("*.jpg"))
    print(f"Found {len(images)} .jpg files")

    label_dir = os.path.join(YOLO_DATA_DIR, "labels", LABEL_SUBDIR)
    counts = {"train": 0, "val": 0}
    for image in images:
        label_basename = os.path.splitext(image.name)[0] + ".txt"
        label_file = os.path.join(label_dir, label_basename)
        # Images without a converted label file are left out of both splits.
        # Checking this first avoids hashing files that are skipped anyway.
        if not os.path.exists(label_file):
            continue

        split = "val" if _is_val_image(image) else "train"
        image_dir, annotation_dir = split_dirs[split]
        shutil.copy2(image, os.path.join(image_dir, image.name))
        _write_remapped_labels(
            label_file, os.path.join(annotation_dir, label_basename)
        )
        counts[split] += 1

    n_train = counts["train"]
    n_val = counts["val"]
    print(f"Copied {n_train} files to train/, {n_val} files to val/")


if __name__ == "__main__":
    main()