"""
Script to split dataset into train, validation, and test sets.

Proportions: 70% train, 15% validation, 15% test

Expected layout: ``SOURCE_DIR`` contains one sub-folder per class, each holding
image files. The script copies every image into
``DEST_DIR/<split>/<class_name>/`` where ``<split>`` is ``train``, ``val`` or
``test``. Source files are copied, never moved or modified.
"""
import os
import random
import shutil
import sys
from pathlib import Path

# Seed for the shuffle so repeated runs produce the same split
RANDOM_SEED = 42

# Paths
SOURCE_DIR = Path('dataset')
DEST_DIR = Path('data')

# Split ratios
TRAIN_RATIO = 0.70
VAL_RATIO = 0.15
TEST_RATIO = 0.15

# Names of the split sub-directories created under DEST_DIR
SPLITS = ('train', 'val', 'test')

# Recognised image suffixes, compared case-insensitively
IMAGE_SUFFIXES = frozenset({'.jpg', '.jpeg', '.png'})

SEPARATOR = "=" * 80


def find_images(folder):
    """Return the image files directly inside *folder*, sorted by path.

    Suffixes are compared case-insensitively against ``IMAGE_SUFFIXES`` in a
    single directory pass. Globbing once per upper/lower-case pattern would
    return every file twice wherever glob matching is case-insensitive, and
    would miss mixed-case suffixes such as ``.Jpg``.

    The result is sorted because directory listing order is filesystem
    dependent; a seeded shuffle is only reproducible when its input order is.
    """
    return sorted(
        entry
        for entry in folder.iterdir()
        if entry.is_file() and entry.suffix.lower() in IMAGE_SUFFIXES
    )


def compute_split_sizes(n_images):
    """Return ``(n_train, n_val, n_test)`` for a class with *n_images* files.

    Train and validation sizes are truncated towards zero; whatever remains
    goes to the test split so the three sizes always sum to *n_images*.
    """
    n_train = int(n_images * TRAIN_RATIO)
    n_val = int(n_images * VAL_RATIO)
    n_test = n_images - n_train - n_val  # Remaining goes to test
    return n_train, n_val, n_test


def prepare_destination():
    """Create empty ``DEST_DIR/<split>`` directories, asking before deleting.

    Every existing split directory is confirmed with the user *before*
    anything is removed, so declining part-way through leaves the existing
    data untouched.
    """
    split_dirs = [DEST_DIR / split for split in SPLITS]

    confirmed = []
    for split_dir in split_dirs:
        if not split_dir.exists():
            continue
        print(f"\nWARNING: {split_dir} already exists!")
        response = input("Delete and recreate? (yes/no): ")
        if response.strip().lower() != 'yes':
            print("Aborting. Please backup or rename existing data directory.")
            # Exit status is left at 0, matching the script's previous behaviour.
            sys.exit()
        confirmed.append(split_dir)

    for split_dir in confirmed:
        shutil.rmtree(split_dir)
        print(f"Deleted {split_dir}")

    for split_dir in split_dirs:
        split_dir.mkdir(parents=True, exist_ok=True)


def split_class(class_folder):
    """Shuffle and copy one class folder's images into the three splits.

    Returns ``(n_images, n_train, n_val, n_test)``, or ``None`` when the
    folder contains no images (a warning is printed and nothing is created).
    """
    class_name = class_folder.name
    image_files = find_images(class_folder)

    if not image_files:
        print(f"WARNING: No images found in {class_name}")
        return None

    random.shuffle(image_files)

    n_images = len(image_files)
    n_train, n_val, n_test = compute_split_sizes(n_images)

    assignments = {
        'train': image_files[:n_train],
        'val': image_files[n_train:n_train + n_val],
        'test': image_files[n_train + n_val:],
    }

    for split, images in assignments.items():
        target_dir = DEST_DIR / split / class_name
        target_dir.mkdir(parents=True, exist_ok=True)
        for img in images:
            shutil.copy2(img, target_dir / img.name)

    print(f"{class_name:40s}: {n_images:4d} total -> Train: {n_train:3d}, Val: {n_val:3d}, Test: {n_test:3d}")
    return n_images, n_train, n_val, n_test


def main():
    """Run the interactive dataset split from ``SOURCE_DIR`` to ``DEST_DIR``."""
    # Set random seed for reproducibility
    random.seed(RANDOM_SEED)

    print(SEPARATOR)
    print("DATASET SPLITTING TOOL")
    print(SEPARATOR)
    print(f"Source: {SOURCE_DIR}")
    print(f"Destination: {DEST_DIR}")
    print(f"Split ratio: Train={TRAIN_RATIO*100}%, Val={VAL_RATIO*100}%, Test={TEST_RATIO*100}%")
    print(SEPARATOR)

    # Check the source before touching the destination, so a wrong path
    # cannot cost the user an existing split.
    if not SOURCE_DIR.is_dir():
        sys.exit(f"ERROR: source directory {SOURCE_DIR} does not exist")

    prepare_destination()

    # Get all class folders
    class_folders = sorted(f for f in SOURCE_DIR.iterdir() if f.is_dir())

    print(f"\nFound {len(class_folders)} classes")
    print(SEPARATOR)

    total_images = total_train = total_val = total_test = 0

    # Process each class
    for class_folder in class_folders:
        counts = split_class(class_folder)
        if counts is None:
            continue
        n_images, n_train, n_val, n_test = counts
        total_images += n_images
        total_train += n_train
        total_val += n_val
        total_test += n_test

    print(SEPARATOR)
    print("SUMMARY")
    print(SEPARATOR)
    print(f"Total images processed: {total_images}")

    if total_images == 0:
        # Percentages are undefined with nothing to split; report and stop
        # instead of dividing by zero.
        print("WARNING: No images were found; nothing was copied.")
        print(SEPARATOR)
        return

    print(f"Train: {total_train} ({total_train/total_images*100:.1f}%)")
    print(f"Val: {total_val} ({total_val/total_images*100:.1f}%)")
    print(f"Test: {total_test} ({total_test/total_images*100:.1f}%)")
    print(SEPARATOR)
    print("\nDataset split completed successfully!")
    print("\nDirectory structure:")
    print(f" {DEST_DIR}/")
    print(f" train/ ({total_train} images)")
    print(f" val/ ({total_val} images)")
    print(f" test/ ({total_test} images)")
    print(SEPARATOR)


if __name__ == "__main__":
    main()