"""Split a folder of images (with optional ``.txt`` label files) into
``train`` and ``val`` sub-folders under a destination directory."""

import os
import random
import shutil
from glob import escape as glob_escape
from glob import glob
from typing import List, Optional

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic; keep the script usable without it.
    def tqdm(iterable, *args, **kwargs):
        """Pass-through stand-in used when ``tqdm`` is not installed."""
        return iterable


SOURCE_ROOT = r"C:\Users\charu\Documents\goyam\roboflow\train"
SOURCE_IMAGES = os.path.join(SOURCE_ROOT, "images")
SOURCE_LABELS = os.path.join(SOURCE_ROOT, "labels")
DEST_DIR = r"C:\Users\charu\Documents\goyam\roboflow\final_split"

TRAIN_RATIO = 0.8

# Both cases are listed because glob is case-sensitive on some platforms.
_IMAGE_PATTERNS = ('*.jpg', '*.jpeg', '*.png', '*.JPG', '*.JPEG', '*.PNG')


def _collect_images(source_images: str) -> List[str]:
    """Return the sorted, de-duplicated image paths found in *source_images*.

    The directory name is escaped so that characters such as ``[`` or ``*``
    in the path are matched literally instead of being read as glob syntax.
    De-duplication matters on case-insensitive filesystems, where ``*.jpg``
    and ``*.JPG`` can match the same file.
    """
    safe_dir = glob_escape(source_images)
    unique_images = set()
    for pattern in _IMAGE_PATTERNS:
        unique_images.update(glob(os.path.join(safe_dir, pattern)))
    # Sorting gives a stable starting order, so a seeded shuffle is repeatable.
    return sorted(unique_images)


def split_dataset(
    source_images: str = SOURCE_IMAGES,
    source_labels: str = SOURCE_LABELS,
    dest_dir: str = DEST_DIR,
    train_ratio: float = TRAIN_RATIO,
    seed: Optional[int] = None,
) -> None:
    """Copy images and their labels into ``train``/``val`` folders.

    Images are shuffled, the first ``int(total * train_ratio)`` go to
    ``<dest_dir>/train`` and the rest to ``<dest_dir>/val``. For each image,
    a label file named ``<image stem>.txt`` in *source_labels* is copied
    alongside it when it exists; images without a label are still copied.

    Args:
        source_images: Directory scanned for ``.jpg``/``.jpeg``/``.png`` files.
        source_labels: Directory holding the matching ``.txt`` label files.
        dest_dir: Root of the output tree; created if missing. Existing
            content is left in place (a warning is printed).
        train_ratio: Fraction of images assigned to the training split.
        seed: When given, the shuffle is reproducible for the same set of
            files. When ``None``, the module-level ``random`` state is used.

    Raises:
        ValueError: If *train_ratio* is not within ``[0, 1]``.
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError(
            f"train_ratio must be between 0 and 1, got {train_ratio!r}"
        )

    if os.path.exists(dest_dir):
        print(f"⚠️ Warning: Destination folder already exists: {dest_dir}")
        print(" (Ideally, delete it before running this to avoid mixing old data!)")

    for split in ('train', 'val'):
        os.makedirs(os.path.join(dest_dir, split, 'images'), exist_ok=True)
        os.makedirs(os.path.join(dest_dir, split, 'labels'), exist_ok=True)

    print(f"🔍 Scanning images in: {source_images}")
    all_images = _collect_images(source_images)

    total_count = len(all_images)
    if total_count == 0:
        print(" Error: No images found!")
        return

    if seed is None:
        # Keep using the shared generator so an outer random.seed() still applies.
        random.shuffle(all_images)
    else:
        random.Random(seed).shuffle(all_images)

    train_count = int(total_count * train_ratio)

    print(f"Found {total_count} unique images.")
    print(f" -> Training: {train_count}")
    print(f" -> Validation: {total_count - train_count}")

    # Copy files
    print("Organizing files...")

    for i, img_path in enumerate(tqdm(all_images)):
        split = 'train' if i < train_count else 'val'

        filename = os.path.basename(img_path)
        label_name = os.path.splitext(filename)[0] + ".txt"

        shutil.copy(img_path, os.path.join(dest_dir, split, 'images', filename))

        # Labels are optional: an image without one is copied on its own.
        src_txt_path = os.path.join(source_labels, label_name)
        if os.path.exists(src_txt_path):
            shutil.copy(
                src_txt_path,
                os.path.join(dest_dir, split, 'labels', label_name),
            )

    print("\nDone! Your dataset is ready at:")
    print(f" {dest_dir}")


if __name__ == "__main__":
    split_dataset()