# diatom-classifier / src /split_yolo.py
# kemalbsoylu's picture
# build YOLOv8 data pipeline and configure diatom detector
# 562cbcf
"""
Splits the YOLO dataset into 80% training and 20% validation sets.
"""
import random
import shutil
from pathlib import Path
import logging
# Configure the root logger at import time: show INFO and above, and print
# only the message text (no timestamp or level prefix).
logging.basicConfig(level=logging.INFO, format="%(message)s")
def split_dataset(
    yolo_base: "str | Path" = "data/yolo_dataset",
    train_ratio: float = 0.8,
    seed: int = 42,
) -> None:
    """Split a flat YOLO dataset into ``train`` and ``val`` subfolders.

    Image files found directly inside ``<yolo_base>/images`` are shuffled
    with a fixed seed and moved into ``images/train`` and ``images/val``.
    For every image, a label file with the same stem and a ``.txt`` suffix
    in ``<yolo_base>/labels`` is moved into the matching ``labels/<split>``
    folder when it exists; images without a label are still moved.

    Args:
        yolo_base: Dataset root containing the ``images`` directory (and
            normally a ``labels`` directory).
        train_ratio: Fraction of images, between 0 and 1, assigned to the
            training split. The train count is rounded down.
        seed: Seed for the shuffle, so repeated runs on the same set of
            file names produce the same split.

    Raises:
        ValueError: If ``train_ratio`` is outside ``[0, 1]``.
        FileNotFoundError: If ``<yolo_base>/images`` does not exist.
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio!r}")

    yolo_base = Path(yolo_base)
    img_dir = yolo_base / "images"
    lbl_dir = yolo_base / "labels"
    if not img_dir.is_dir():
        raise FileNotFoundError(f"Image directory not found: {img_dir}")

    # Only move real images; stray files (README, .DS_Store, caches) that
    # happen to sit in the images folder must not end up in a split.
    image_suffixes = {
        ".bmp", ".dng", ".heic", ".jpeg", ".jpg", ".mpo",
        ".pfm", ".png", ".tif", ".tiff", ".webp",
    }

    # Create standard YOLO split directories. parents=True also covers a
    # dataset that has no labels/ folder yet.
    for split in ("train", "val"):
        (img_dir / split).mkdir(parents=True, exist_ok=True)
        (lbl_dir / split).mkdir(parents=True, exist_ok=True)

    # Directory listing order is filesystem-dependent, so sort by name first;
    # otherwise the fixed seed would not give a reproducible split.
    images = sorted(
        (
            f
            for f in img_dir.iterdir()
            if f.is_file() and f.suffix.lower() in image_suffixes
        ),
        key=lambda f: f.name,
    )
    if not images:
        logging.warning(f"No images found in {img_dir}; nothing to split.")
        return

    # Use a private generator so the process-wide random state is untouched.
    random.Random(seed).shuffle(images)
    split_idx = int(len(images) * train_ratio)
    train_imgs = images[:split_idx]
    val_imgs = images[split_idx:]

    logging.info(f"Splitting dataset: {len(train_imgs)} Train | {len(val_imgs)} Val")

    for split_name, file_list in (("train", train_imgs), ("val", val_imgs)):
        for img_path in file_list:
            # Move the image itself.
            shutil.move(str(img_path), str(img_dir / split_name / img_path.name))

            # Move the matching label if there is one; an image without a
            # label file is left unlabelled in its split.
            lbl_name = img_path.stem + ".txt"
            lbl_path = lbl_dir / lbl_name
            if lbl_path.exists():
                shutil.move(str(lbl_path), str(lbl_dir / split_name / lbl_name))

    logging.info("✅ Dataset successfully split!")
# Run the split only when this file is executed as a script, not on import.
if __name__ == "__main__":
    split_dataset()