Spaces:
Sleeping
Sleeping
Update dataset splitting script for the 900-image dataset
Browse files
This script organizes the newly augmented 900-image dataset into the standard YOLO directory structure (train and val).
Randomly shuffles and splits the data using an 80% Training / 20% Validation ratio.
Safely maps and copies both images (.jpg, .png, .jpeg) and their corresponding .txt segmentation labels to their respective destination folders.
Includes error handling to flag and log any missing label files during the transfer process to ensure dataset integrity.
split.py
CHANGED
|
@@ -2,58 +2,75 @@ import os
|
|
| 2 |
import shutil
|
| 3 |
import random
|
| 4 |
from glob import glob
|
|
|
|
| 5 |
|
| 6 |
# ================= CONFIGURATION =================
|
| 7 |
-
#
|
| 8 |
-
|
| 9 |
-
|
|
|
|
| 10 |
|
| 11 |
-
#
|
| 12 |
-
DEST_DIR = r"C:\Users\charu\Desktop\
|
| 13 |
|
| 14 |
-
# Split Ratio (80% Train, 20% Validation)
|
| 15 |
TRAIN_RATIO = 0.8
|
| 16 |
# =================================================
|
| 17 |
|
| 18 |
-
def
|
| 19 |
-
# 1. Create Folders
|
| 20 |
for split in ['train', 'val']:
|
| 21 |
os.makedirs(os.path.join(DEST_DIR, split, 'images'), exist_ok=True)
|
| 22 |
os.makedirs(os.path.join(DEST_DIR, split, 'labels'), exist_ok=True)
|
| 23 |
|
| 24 |
-
# 2.
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
-
|
| 29 |
-
|
| 30 |
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
# 3.
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
filename = os.path.basename(img_path)
|
| 38 |
-
|
| 39 |
-
label_path = os.path.join(SOURCE_LABELS, label_name)
|
| 40 |
|
| 41 |
-
#
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
# Copy Image
|
| 45 |
-
shutil.copy(img_path,
|
| 46 |
|
| 47 |
-
# Copy Label
|
| 48 |
-
if os.path.exists(
|
| 49 |
-
shutil.copy(
|
| 50 |
else:
|
| 51 |
-
|
| 52 |
-
# This ensures YOLO knows it's a background image
|
| 53 |
-
with open(os.path.join(DEST_DIR, split, 'labels', label_name), 'w') as f:
|
| 54 |
-
pass
|
| 55 |
|
| 56 |
-
print(f"\n
|
|
|
|
| 57 |
|
| 58 |
if __name__ == "__main__":
|
| 59 |
-
|
|
|
|
| 2 |
import shutil
|
| 3 |
import random
|
| 4 |
from glob import glob
|
| 5 |
+
from tqdm import tqdm
|
| 6 |
|
| 7 |
# ================= CONFIGURATION =================
|
| 8 |
+
# 1. INPUT: Your current folder with images/ and labels/
|
| 9 |
+
SOURCE_ROOT = r"C:\Users\charu\Desktop\04-02-2026"
|
| 10 |
+
SOURCE_IMAGES = os.path.join(SOURCE_ROOT, "images")
|
| 11 |
+
SOURCE_LABELS = os.path.join(SOURCE_ROOT, "labels")
|
| 12 |
|
| 13 |
+
# 2. OUTPUT: Where the ready-to-train data will go
|
| 14 |
+
DEST_DIR = r"C:\Users\charu\Desktop\04-02-2026\final_split_dataset"
|
| 15 |
|
| 16 |
+
# 3. Split Ratio (80% Train, 20% Validation)
|
| 17 |
TRAIN_RATIO = 0.8
|
| 18 |
# =================================================
|
| 19 |
|
| 20 |
+
def split_dataset():
|
| 21 |
+
# 1. Create Destination Folders
|
| 22 |
for split in ['train', 'val']:
|
| 23 |
os.makedirs(os.path.join(DEST_DIR, split, 'images'), exist_ok=True)
|
| 24 |
os.makedirs(os.path.join(DEST_DIR, split, 'labels'), exist_ok=True)
|
| 25 |
|
| 26 |
+
# 2. Find all images
|
| 27 |
+
print(f"π Scanning images in: {SOURCE_IMAGES}")
|
| 28 |
+
jpgs = glob(os.path.join(SOURCE_IMAGES, "*.jpg"))
|
| 29 |
+
pngs = glob(os.path.join(SOURCE_IMAGES, "*.png"))
|
| 30 |
+
jpeg = glob(os.path.join(SOURCE_IMAGES, "*.jpeg"))
|
| 31 |
+
all_images = jpgs + pngs + jpeg
|
| 32 |
|
| 33 |
+
# Shuffle for randomness
|
| 34 |
+
random.shuffle(all_images)
|
| 35 |
|
| 36 |
+
total_count = len(all_images)
|
| 37 |
+
train_count = int(total_count * TRAIN_RATIO)
|
| 38 |
+
|
| 39 |
+
if total_count == 0:
|
| 40 |
+
print("Error: No images found! Check your source path.")
|
| 41 |
+
return
|
| 42 |
+
|
| 43 |
+
print(f"π Found {total_count} images.")
|
| 44 |
+
print(f" -> Training: {train_count}")
|
| 45 |
+
print(f" -> Validation: {total_count - train_count}")
|
| 46 |
|
| 47 |
+
# 3. Copy Files
|
| 48 |
+
print("📦 Organizing files...")
|
| 49 |
+
for i, img_path in enumerate(tqdm(all_images)):
|
| 50 |
+
# Decide split
|
| 51 |
+
split = 'train' if i < train_count else 'val'
|
| 52 |
+
|
| 53 |
filename = os.path.basename(img_path)
|
| 54 |
+
name_no_ext = os.path.splitext(filename)[0]
|
|
|
|
| 55 |
|
| 56 |
+
# Paths
|
| 57 |
+
dest_img_path = os.path.join(DEST_DIR, split, 'images', filename)
|
| 58 |
+
|
| 59 |
+
# Find matching label
|
| 60 |
+
src_txt_path = os.path.join(SOURCE_LABELS, name_no_ext + ".txt")
|
| 61 |
+
dest_txt_path = os.path.join(DEST_DIR, split, 'labels', name_no_ext + ".txt")
|
| 62 |
|
| 63 |
# Copy Image
|
| 64 |
+
shutil.copy(img_path, dest_img_path)
|
| 65 |
|
| 66 |
+
# Copy Label
|
| 67 |
+
if os.path.exists(src_txt_path):
|
| 68 |
+
shutil.copy(src_txt_path, dest_txt_path)
|
| 69 |
else:
|
| 70 |
+
print(f" Warning: Missing label for {filename}")
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
+
print(f"\n Done! Your training data is ready at:")
|
| 73 |
+
print(f" {DEST_DIR}")
|
| 74 |
|
| 75 |
if __name__ == "__main__":
|
| 76 |
+
split_dataset()
|