Charuka66 committed on
Commit
5bbbbc0
·
verified ·
1 Parent(s): b730b20

Add script to split dataset into train and validation sets

Browse files

This script organizes the balanced seed dataset into the required YOLO folder structure (train and val).
Randomly shuffles and splits the dataset using an 80% Training / 20% Validation ratio.
Automatically routes both images and their corresponding .txt segmentation labels.
Includes a safety fallback to generate empty label files for background/healthy images, preventing YOLO training errors.

Files changed (1) hide show
  1. split.py +59 -0
split.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import random
4
+ from glob import glob
5
+
6
# ================= CONFIGURATION =================
# Input folders: the seed images and their matching label (.txt) files.
# NOTE(review): these are absolute paths on one specific Windows machine;
# edit them before running the script anywhere else.
SOURCE_IMAGES = r"C:\Users\charu\Desktop\My_Project\seed_images"
SOURCE_LABELS = r"C:\Users\charu\Desktop\My_Project\seed_labels"

# Output folder: root under which the train/ and val/ folders are created.
DEST_DIR = r"C:\Users\charu\Desktop\My_Project\teacher_v2_dataset"

# Split ratio: fraction of images sent to training (0.8 = 80% train,
# 20% validation).
TRAIN_RATIO = 0.8
# =================================================
17
+
18
# Image suffixes picked up from the source folder (compared case-insensitively).
_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')


def _list_images(folder):
    """Return the sorted paths of the image files directly inside *folder*.

    Matching is case-insensitive so the result is the same on every platform.
    Hidden files (names starting with ".") are skipped, as a ``*.jpg`` glob
    would skip them. A missing folder yields an empty list instead of an error.
    """
    if not os.path.isdir(folder):
        return []
    return sorted(
        os.path.join(folder, name)
        for name in os.listdir(folder)
        if not name.startswith('.')
        and name.lower().endswith(_IMAGE_EXTENSIONS)
        and os.path.isfile(os.path.join(folder, name))
    )


def split_data(source_images=None, source_labels=None, dest_dir=None,
               train_ratio=None, seed=None):
    """Copy a shuffled image/label dataset into ``train`` and ``val`` folders.

    Creates ``<dest_dir>/{train,val}/{images,labels}``, shuffles the images
    found in the source folder and copies the first ``train_ratio`` share
    (rounded down) to ``train`` and the remainder to ``val``. Each image's
    ``<stem>.txt`` label is copied next to it; when no label file exists an
    empty one is created so the image is treated as background.

    Source files are copied, never moved or modified. If the destination
    already holds a copy of an image (or its label) in the *other* split from
    an earlier run, that stale copy is removed so an image never ends up in
    both ``train`` and ``val``. Files in the destination that do not belong
    to any current source image are left untouched.

    Args:
        source_images: Folder with the input images. Defaults to
            ``SOURCE_IMAGES``.
        source_labels: Folder with the ``.txt`` label files. Defaults to
            ``SOURCE_LABELS``.
        dest_dir: Root output folder. Defaults to ``DEST_DIR``.
        train_ratio: Fraction of images (0..1) assigned to training.
            Defaults to ``TRAIN_RATIO``.
        seed: Optional seed for a reproducible split. When ``None`` the
            global ``random`` state is used, as before.

    Raises:
        ValueError: If ``train_ratio`` is not within ``[0, 1]``.
    """
    # Resolve defaults at call time so the module-level configuration stays
    # the default while callers can override any value.
    source_images = SOURCE_IMAGES if source_images is None else source_images
    source_labels = SOURCE_LABELS if source_labels is None else source_labels
    dest_dir = DEST_DIR if dest_dir is None else dest_dir
    train_ratio = TRAIN_RATIO if train_ratio is None else train_ratio

    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(
            f"train_ratio must be between 0 and 1, got {train_ratio!r}"
        )

    # 1. Create Folders
    for split in ('train', 'val'):
        for kind in ('images', 'labels'):
            os.makedirs(os.path.join(dest_dir, split, kind), exist_ok=True)

    # 2. Get List of All Images (sorted first, so a seed fully determines
    #    the resulting split regardless of directory listing order)
    images = _list_images(source_images)
    if seed is None:
        random.shuffle(images)
    else:
        random.Random(seed).shuffle(images)

    total = len(images)
    train_count = int(total * train_ratio)

    print(f"📦 Organizing {total} images...")
    print(f" - Training: {train_count}")
    print(f" - Validation: {total - train_count}")
    if total == 0:
        print(f"⚠️ No .jpg/.jpeg/.png images found in: {source_images}")

    # 3. Copy Files
    written = set()      # destination paths produced during this run
    missing_labels = 0   # images for which an empty label had to be created
    for i, img_path in enumerate(images):
        filename = os.path.basename(img_path)
        label_name = os.path.splitext(filename)[0] + ".txt"
        label_path = os.path.join(source_labels, label_name)

        # Decide destination (Train or Val)
        split, other = ('train', 'val') if i < train_count else ('val', 'train')
        img_dest = os.path.join(dest_dir, split, 'images', filename)
        label_dest = os.path.join(dest_dir, split, 'labels', label_name)

        # Drop leftovers of this sample in the other split (from a previous
        # run), but never a file written during this run: two images with
        # the same stem share one label name.
        for stale in (os.path.join(dest_dir, other, 'images', filename),
                      os.path.join(dest_dir, other, 'labels', label_name)):
            if stale not in written and os.path.isfile(stale):
                os.remove(stale)

        # Copy Image
        shutil.copy(img_path, img_dest)

        # Copy Label (If it exists)
        if os.path.isfile(label_path):
            shutil.copy(label_path, label_dest)
        else:
            # No label for this image: write an empty label file so it is
            # treated as a background ("Healthy") image.
            with open(label_dest, 'w'):
                pass
            missing_labels += 1

        written.update((img_dest, label_dest))

    if missing_labels:
        print(f"⚠️ {missing_labels} image(s) had no label file; "
              f"empty labels were created.")
    print(f"\n✅ Done! Dataset ready at: {dest_dir}")
57
+
58
# Run the split only when this file is executed as a script, not on import.
if __name__ == "__main__":
    split_data()