Charuka66 committed on
Commit
c51e7bb
·
verified ·
1 Parent(s): 96b923d

Update dataset splitting script for the 900-image dataset

Browse files

This script organizes the newly augmented 900-image dataset into the standard YOLO directory structure (train and val).
Randomly shuffles and splits the data using an 80% Training / 20% Validation ratio.
Safely maps and copies both images (.jpg, .png, .jpeg) and their corresponding .txt segmentation labels to their respective destination folders.
Includes error handling to flag and log any missing label files during the transfer process to ensure dataset integrity.

Files changed (1) hide show
  1. split.py +49 -32
split.py CHANGED
@@ -2,58 +2,75 @@ import os
2
  import shutil
3
  import random
4
  from glob import glob
 
5
 
6
  # ================= CONFIGURATION =================
7
- # Input Folders (Where your 400 images are now)
8
- SOURCE_IMAGES = r"C:\Users\charu\Desktop\My_Project\seed_images"
9
- SOURCE_LABELS = r"C:\Users\charu\Desktop\My_Project\seed_labels"
 
10
 
11
- # Output Folder (Where the training data will go)
12
- DEST_DIR = r"C:\Users\charu\Desktop\My_Project\teacher_v2_dataset"
13
 
14
- # Split Ratio (80% Train, 20% Validation)
15
  TRAIN_RATIO = 0.8
16
  # =================================================
17
 
18
- def split_data():
19
- # 1. Create Folders
20
  for split in ['train', 'val']:
21
  os.makedirs(os.path.join(DEST_DIR, split, 'images'), exist_ok=True)
22
  os.makedirs(os.path.join(DEST_DIR, split, 'labels'), exist_ok=True)
23
 
24
- # 2. Get List of All Images
25
- images = glob(os.path.join(SOURCE_IMAGES, "*.jpg")) + glob(os.path.join(SOURCE_IMAGES, "*.png"))
26
- random.shuffle(images)
 
 
 
27
 
28
- total = len(images)
29
- train_count = int(total * TRAIN_RATIO)
30
 
31
- print(f"📦 Organizing {total} images...")
32
- print(f" - Training: {train_count}")
33
- print(f" - Validation: {total - train_count}")
 
 
 
 
 
 
 
34
 
35
- # 3. Move Files
36
- for i, img_path in enumerate(images):
 
 
 
 
37
  filename = os.path.basename(img_path)
38
- label_name = os.path.splitext(filename)[0] + ".txt"
39
- label_path = os.path.join(SOURCE_LABELS, label_name)
40
 
41
- # Decide destination (Train or Val)
42
- split = 'train' if i < train_count else 'val'
 
 
 
 
43
 
44
  # Copy Image
45
- shutil.copy(img_path, os.path.join(DEST_DIR, split, 'images', filename))
46
 
47
- # Copy Label (If it exists)
48
- if os.path.exists(label_path):
49
- shutil.copy(label_path, os.path.join(DEST_DIR, split, 'labels', label_name))
50
  else:
51
- # If no label exists (shouldn't happen, but just in case), create an empty one for "Healthy"
52
- # This ensures YOLO knows it's a background image
53
- with open(os.path.join(DEST_DIR, split, 'labels', label_name), 'w') as f:
54
- pass
55
 
56
- print(f"\n✅ Done! Dataset ready at: {DEST_DIR}")
 
57
 
58
  if __name__ == "__main__":
59
- split_data()
 
2
  import shutil
3
  import random
4
  from glob import glob
5
+ from tqdm import tqdm
6
 
7
  # ================= CONFIGURATION =================
8
+ # 1. INPUT: Your current folder with images/ and labels/
9
+ SOURCE_ROOT = r"C:\Users\charu\Desktop\04-02-2026"
10
+ SOURCE_IMAGES = os.path.join(SOURCE_ROOT, "images")
11
+ SOURCE_LABELS = os.path.join(SOURCE_ROOT, "labels")
12
 
13
+ # 2. OUTPUT: Where the ready-to-train data will go
14
+ DEST_DIR = r"C:\Users\charu\Desktop\04-02-2026\final_split_dataset"
15
 
16
+ # 3. Split Ratio (80% Train, 20% Validation)
17
  TRAIN_RATIO = 0.8
18
  # =================================================
19
 
20
def split_dataset(source_images=None, source_labels=None, dest_dir=None,
                  train_ratio=None, seed=None):
    """Split an image/label dataset into YOLO-style ``train``/``val`` folders.

    Images are shuffled, the first ``train_ratio`` share goes to ``train`` and
    the rest to ``val``. Each image is copied to ``<dest>/<split>/images`` and
    its same-named ``.txt`` label (if present) to ``<dest>/<split>/labels``.
    Images without a label are still copied and are reported at the end.

    Copies left in the *other* split by an earlier run are removed first, so
    re-running with a different shuffle cannot leave one image in both
    ``train`` and ``val``.

    Args:
        source_images: Folder holding the images. Defaults to SOURCE_IMAGES.
        source_labels: Folder holding the ``.txt`` labels. Defaults to
            SOURCE_LABELS.
        dest_dir: Root of the output tree. Defaults to DEST_DIR.
        train_ratio: Fraction of images assigned to ``train``. Defaults to
            TRAIN_RATIO.
        seed: Optional seed for a reproducible split. When ``None`` the
            module-level ``random`` state is used, as before.

    Returns:
        None. Progress and results are printed.
    """
    # Fall back to the module-level configuration for anything not supplied.
    if source_images is None:
        source_images = SOURCE_IMAGES
    if source_labels is None:
        source_labels = SOURCE_LABELS
    if dest_dir is None:
        dest_dir = DEST_DIR
    if train_ratio is None:
        train_ratio = TRAIN_RATIO

    # The progress bar is cosmetic: keep working when tqdm is unavailable.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable):
            return iterable

    # 1. Find all images (sorted, so a given seed always yields the same split)
    print(f"🔍 Scanning images in: {source_images}")
    all_images = _list_images(source_images)

    total_count = len(all_images)
    if total_count == 0:
        # Bail out before touching the destination so a wrong source path
        # does not leave an empty folder tree behind.
        print("Error: No images found! Check your source path.")
        return

    # 2. Shuffle and decide the split
    if seed is None:
        random.shuffle(all_images)
    else:
        random.Random(seed).shuffle(all_images)

    train_count = int(total_count * train_ratio)
    print(f"📊 Found {total_count} images.")
    print(f" -> Training: {train_count}")
    print(f" -> Validation: {total_count - train_count}")

    plan = [
        (img_path, 'train' if i < train_count else 'val')
        for i, img_path in enumerate(all_images)
    ]

    # 3. Create destination folders and clear out stale copies
    for split in ('train', 'val'):
        os.makedirs(os.path.join(dest_dir, split, 'images'), exist_ok=True)
        os.makedirs(os.path.join(dest_dir, split, 'labels'), exist_ok=True)
    _remove_stale_copies(dest_dir, plan)

    # 4. Copy files
    print("📦 Organizing files...")
    missing_labels = []
    for img_path, split in progress(plan):
        filename = os.path.basename(img_path)
        label_name = os.path.splitext(filename)[0] + ".txt"

        shutil.copy(img_path, os.path.join(dest_dir, split, 'images', filename))

        src_txt_path = os.path.join(source_labels, label_name)
        if os.path.isfile(src_txt_path):
            shutil.copy(
                src_txt_path,
                os.path.join(dest_dir, split, 'labels', label_name),
            )
        else:
            # Reported after the loop so the progress bar is not broken up.
            missing_labels.append(filename)

    for filename in missing_labels:
        print(f" Warning: Missing label for {filename}")

    print("\n Done! Your training data is ready at:")
    print(f" {dest_dir}")


# Extensions treated as images; compared case-insensitively.
_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')


def _list_images(folder):
    """Return the sorted paths of image files directly inside *folder*.

    Matching is case-insensitive on every platform (``.JPG`` counts), hidden
    files (leading dot) are skipped, and a missing folder yields ``[]``.
    """
    if not os.path.isdir(folder):
        return []
    return sorted(
        os.path.join(folder, name)
        for name in os.listdir(folder)
        if not name.startswith('.')
        and name.lower().endswith(_IMAGE_EXTENSIONS)
        and os.path.isfile(os.path.join(folder, name))
    )


def _remove_stale_copies(dest_dir, plan):
    """Delete copies of planned images/labels sitting in the opposite split.

    Runs as a separate pass before any copying, so images that share a label
    stem cannot have a freshly copied label removed again.
    """
    for img_path, split in plan:
        other = 'val' if split == 'train' else 'train'
        filename = os.path.basename(img_path)
        label_name = os.path.splitext(filename)[0] + ".txt"
        stale_paths = (
            os.path.join(dest_dir, other, 'images', filename),
            os.path.join(dest_dir, other, 'labels', label_name),
        )
        for stale in stale_paths:
            if os.path.isfile(stale):
                os.remove(stale)


if __name__ == "__main__":
    split_dataset()