Spaces:

Goyamproject
/

React_native_app

Sleeping

App Files Files Community

Charuka66 committed on Feb 13

Commit

c51e7bb

verified ·

1 Parent(s): 96b923d

Update dataset splitting script for the 900-image dataset

Browse files

This script organizes the newly augmented 900-image dataset into the standard YOLO directory structure (train and val).
It randomly shuffles the images and splits them using an 80% training / 20% validation ratio.
It copies each image (.jpg, .png, .jpeg) and its matching .txt segmentation label into the corresponding destination folder, leaving the source files untouched.
It prints a warning for every image whose label file is missing, so gaps in the dataset are visible after the copy.

Files changed (1) hide show

split.py +49 -32

split.py CHANGED Viewed

@@ -2,58 +2,75 @@ import os
 import shutil
 import random
 from glob import glob
 # ================= CONFIGURATION =================
-# Input Folders (Where your 400 images are now)
-SOURCE_IMAGES = r"C:\Users\charu\Desktop\My_Project\seed_images"
-SOURCE_LABELS = r"C:\Users\charu\Desktop\My_Project\seed_labels"
-# Output Folder (Where the training data will go)
-DEST_DIR = r"C:\Users\charu\Desktop\My_Project\teacher_v2_dataset"
-# Split Ratio (80% Train, 20% Validation)
 TRAIN_RATIO = 0.8
 # =================================================
-def split_data():
-    # 1. Create Folders
     for split in ['train', 'val']:
         os.makedirs(os.path.join(DEST_DIR, split, 'images'), exist_ok=True)
         os.makedirs(os.path.join(DEST_DIR, split, 'labels'), exist_ok=True)
-    # 2. Get List of All Images
-    images = glob(os.path.join(SOURCE_IMAGES, "*.jpg")) + glob(os.path.join(SOURCE_IMAGES, "*.png"))
-    random.shuffle(images)
-    total = len(images)
-    train_count = int(total * TRAIN_RATIO)
-    print(f"📦 Organizing {total} images...")
-    print(f"   - Training: {train_count}")
-    print(f"   - Validation: {total - train_count}")
-    # 3. Move Files
-    for i, img_path in enumerate(images):
         filename = os.path.basename(img_path)
-        label_name = os.path.splitext(filename)[0] + ".txt"
-        label_path = os.path.join(SOURCE_LABELS, label_name)
-        # Decide destination (Train or Val)
-        split = 'train' if i < train_count else 'val'
         # Copy Image
-        shutil.copy(img_path, os.path.join(DEST_DIR, split, 'images', filename))
-        # Copy Label (If it exists)
-        if os.path.exists(label_path):
-            shutil.copy(label_path, os.path.join(DEST_DIR, split, 'labels', label_name))
         else:
-            # If no label exists (shouldn't happen, but just in case), create an empty one for "Healthy"
-            # This ensures YOLO knows it's a background image
-            with open(os.path.join(DEST_DIR, split, 'labels', label_name), 'w') as f:
-                pass
-    print(f"\n✅ Done! Dataset ready at: {DEST_DIR}")
 if __name__ == "__main__":
-    split_data()

 import shutil
 import random
 from glob import glob
+from tqdm import tqdm
 # ================= CONFIGURATION =================
+# 1. INPUT: Your current folder with images/ and labels/
+SOURCE_ROOT = r"C:\Users\charu\Desktop\04-02-2026"
+SOURCE_IMAGES = os.path.join(SOURCE_ROOT, "images")
+SOURCE_LABELS = os.path.join(SOURCE_ROOT, "labels")
+# 2. OUTPUT: Where the ready-to-train data will go
+DEST_DIR = r"C:\Users\charu\Desktop\04-02-2026\final_split_dataset"
+# 3. Split Ratio (80% Train, 20% Validation)
 TRAIN_RATIO = 0.8
 # =================================================
+def split_dataset():
+    # 1. Create Destination Folders
     for split in ['train', 'val']:
         os.makedirs(os.path.join(DEST_DIR, split, 'images'), exist_ok=True)
         os.makedirs(os.path.join(DEST_DIR, split, 'labels'), exist_ok=True)
+    # 2. Find all images
+    print(f"🔍 Scanning images in: {SOURCE_IMAGES}")
+    jpgs = glob(os.path.join(SOURCE_IMAGES, "*.jpg"))
+    pngs = glob(os.path.join(SOURCE_IMAGES, "*.png"))
+    jpeg = glob(os.path.join(SOURCE_IMAGES, "*.jpeg"))
+    all_images = jpgs + pngs + jpeg
+    # Shuffle for randomness
+    random.shuffle(all_images)
+    total_count = len(all_images)
+    train_count = int(total_count * TRAIN_RATIO)
+    if total_count == 0:
+        print("Error: No images found! Check your source path.")
+        return
+    print(f"📊 Found {total_count} images.")
+    print(f"   -> Training: {train_count}")
+    print(f"   -> Validation: {total_count - train_count}")
+    # 3. Copy Files
+    print("📦 Organizing files...")
+    for i, img_path in enumerate(tqdm(all_images)):
+        # Decide split
+        split = 'train' if i < train_count else 'val'
         filename = os.path.basename(img_path)
+        name_no_ext = os.path.splitext(filename)[0]
+        # Paths
+        dest_img_path = os.path.join(DEST_DIR, split, 'images', filename)
+        # Find matching label
+        src_txt_path = os.path.join(SOURCE_LABELS, name_no_ext + ".txt")
+        dest_txt_path = os.path.join(DEST_DIR, split, 'labels', name_no_ext + ".txt")
         # Copy Image
+        shutil.copy(img_path, dest_img_path)
+        # Copy Label
+        if os.path.exists(src_txt_path):
+            shutil.copy(src_txt_path, dest_txt_path)
         else:
+            print(f" Warning: Missing label for {filename}")
+    print(f"\n Done! Your training data is ready at:")
+    print(f"   {DEST_DIR}")
 if __name__ == "__main__":
+    split_dataset()