Spaces:
Sleeping
Sleeping
Add script to split dataset into train and validation sets
Browse files
This script organizes the balanced seed dataset into the required YOLO folder structure (train and val).
Randomly shuffles and splits the dataset using an 80% Training / 20% Validation ratio.
Automatically routes both images and their corresponding .txt segmentation labels.
Includes a safety fallback to generate empty label files for background/healthy images, preventing YOLO training errors.
split.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import shutil
|
| 3 |
+
import random
|
| 4 |
+
from glob import glob
|
| 5 |
+
|
| 6 |
+
# ================= CONFIGURATION =================
|
| 7 |
+
# Input folders read by split_data(). Both are absolute, machine-specific
# Windows paths, so edit them before running on another machine.
# SOURCE_IMAGES is searched for image files; for every image, split_data()
# looks for a label named "<image stem>.txt" inside SOURCE_LABELS.
|
| 8 |
+
SOURCE_IMAGES = r"C:\Users\charu\Desktop\My_Project\seed_images"
|
| 9 |
+
SOURCE_LABELS = r"C:\Users\charu\Desktop\My_Project\seed_labels"
|
| 10 |
+
|
| 11 |
+
# Output folder. split_data() creates train/ and val/ inside it, each with
# an images/ and a labels/ subfolder, and copies the files there.
|
| 12 |
+
DEST_DIR = r"C:\Users\charu\Desktop\My_Project\teacher_v2_dataset"
|
| 13 |
+
|
| 14 |
+
# Fraction of the shuffled images assigned to train; the remainder goes to
# val (0.8 -> 80% train / 20% val). The train count is
# int(total * TRAIN_RATIO), i.e. truncated toward zero.
|
| 15 |
+
TRAIN_RATIO = 0.8
|
| 16 |
+
# =================================================
|
| 17 |
+
|
| 18 |
+
def split_data(source_images=None, source_labels=None, dest_dir=None,
               train_ratio=None, seed=None):
    """Copy a shuffled image/label dataset into a YOLO train/val layout.

    Images found in ``source_images`` are shuffled, the first
    ``int(total * train_ratio)`` are assigned to ``train`` and the rest to
    ``val``. Each image is copied to ``<dest_dir>/<split>/images`` and its
    label (``<image stem>.txt`` from ``source_labels``) to
    ``<dest_dir>/<split>/labels``. The source files are never moved or
    modified.

    Args:
        source_images: Folder containing the images. Defaults to the
            module-level ``SOURCE_IMAGES``.
        source_labels: Folder containing the ``.txt`` label files. Defaults
            to the module-level ``SOURCE_LABELS``.
        dest_dir: Root of the output dataset. Defaults to the module-level
            ``DEST_DIR``.
        train_ratio: Fraction of images (0..1) sent to ``train``. Defaults
            to the module-level ``TRAIN_RATIO``.
        seed: Optional seed for a reproducible split. When ``None`` the
            module-level ``random`` state is used, as before.

    Raises:
        ValueError: If ``train_ratio`` is outside the range 0..1.
    """
    # Resolve defaults at call time so that calling split_data() with no
    # arguments keeps using the module-level configuration.
    if source_images is None:
        source_images = SOURCE_IMAGES
    if source_labels is None:
        source_labels = SOURCE_LABELS
    if dest_dir is None:
        dest_dir = DEST_DIR
    if train_ratio is None:
        train_ratio = TRAIN_RATIO

    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(
            f"train_ratio must be between 0 and 1, got {train_ratio!r}"
        )

    # 1. Create folders
    for split in ('train', 'val'):
        for kind in ('images', 'labels'):
            os.makedirs(os.path.join(dest_dir, split, kind), exist_ok=True)

    # 2. Get list of all images.
    # Extensions are compared case-insensitively so that e.g. "a.JPG" is not
    # skipped on case-sensitive filesystems; ".jpeg" is accepted as well.
    image_exts = ('.jpg', '.jpeg', '.png')
    names = os.listdir(source_images) if os.path.isdir(source_images) else []
    # Sorted first so that a given seed always yields the same split,
    # independent of the directory listing order.
    images = sorted(
        os.path.join(source_images, name)
        for name in names
        if name.lower().endswith(image_exts)
        and os.path.isfile(os.path.join(source_images, name))
    )

    if not images:
        # Nothing to split: say so instead of reporting a ready dataset.
        print(f"⚠️ No .jpg/.jpeg/.png images found in: {source_images}")
        return

    if seed is None:
        random.shuffle(images)
    else:
        random.Random(seed).shuffle(images)

    total = len(images)
    train_count = int(total * train_ratio)

    print(f"📦 Organizing {total} images...")
    print(f" - Training: {train_count}")
    print(f" - Validation: {total - train_count}")

    # Decide the destination (train or val) of every image up front.
    assignments = []
    for i, img_path in enumerate(images):
        filename = os.path.basename(img_path)
        label_name = os.path.splitext(filename)[0] + ".txt"
        split = 'train' if i < train_count else 'val'
        assignments.append((img_path, filename, label_name, split))

    # 3a. Remove stale copies left by an earlier run in the *other* split.
    # Without this, re-running the script with a new shuffle leaves the same
    # image in both train and val, which leaks training data into validation.
    # This runs as a separate pass before any copy so that it can never
    # delete a file written by the current run.
    for _, filename, label_name, split in assignments:
        other = 'val' if split == 'train' else 'train'
        stale_paths = (
            os.path.join(dest_dir, other, 'images', filename),
            os.path.join(dest_dir, other, 'labels', label_name),
        )
        for stale in stale_paths:
            if os.path.isfile(stale):
                os.remove(stale)

    # 3b. Copy files (the originals stay in place).
    missing_labels = 0
    for img_path, filename, label_name, split in assignments:
        label_src = os.path.join(source_labels, label_name)
        label_dst = os.path.join(dest_dir, split, 'labels', label_name)

        shutil.copy(img_path, os.path.join(dest_dir, split, 'images', filename))

        if os.path.exists(label_src):
            shutil.copy(label_src, label_dst)
        else:
            # No label for this image: write an empty label file so YOLO
            # treats it as a background ("healthy") image.
            missing_labels += 1
            with open(label_dst, 'w'):
                pass

    if missing_labels:
        # Surface the fallback so missing annotations do not go unnoticed.
        print(
            f"⚠️ {missing_labels} image(s) had no label file; "
            "empty label files were created for them."
        )

    print(f"\n✅ Done! Dataset ready at: {dest_dir}")
|
| 57 |
+
|
| 58 |
+
# Script entry point: run the split only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    split_data()
|