File size: 2,411 Bytes
5bbbbc0
 
 
 
c51e7bb
5bbbbc0
429f4df
 
 
 
 
 
 
5bbbbc0
 
429f4df
 
77fe907
 
 
 
 
5bbbbc0
 
 
 
429f4df
 
77fe907
429f4df
77fe907
 
 
 
429f4df
77fe907
 
 
 
 
 
 
c51e7bb
5bbbbc0
c51e7bb
 
 
429f4df
77fe907
429f4df
 
77fe907
c51e7bb
 
5bbbbc0
77fe907
 
c51e7bb
 
 
5bbbbc0
c51e7bb
5bbbbc0
c51e7bb
 
77fe907
c51e7bb
 
77fe907
c51e7bb
77fe907
c51e7bb
 
5bbbbc0
77fe907
429f4df
5bbbbc0
 
429f4df
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import os
import shutil
import random
from glob import glob
from tqdm import tqdm

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Folder holding the un-split dataset; images and labels are read from its
# "images" and "labels" subfolders respectively.
SOURCE_ROOT = r"C:\Users\charu\Documents\goyam\roboflow\train"
SOURCE_IMAGES, SOURCE_LABELS = [
    os.path.join(SOURCE_ROOT, subfolder) for subfolder in ("images", "labels")
]

# Folder that receives the train/val output tree.
DEST_DIR = r"C:\Users\charu\Documents\goyam\roboflow\final_split"

# Share of the images assigned to the training split; the rest is validation.
TRAIN_RATIO = 0.8


def split_dataset(seed=None):
    """Split the source images (and their labels) into train/val folders.

    Image files are collected from ``SOURCE_IMAGES``, shuffled, and copied to
    ``DEST_DIR/<split>/images`` where ``<split>`` is ``train`` for the first
    ``int(total * TRAIN_RATIO)`` images and ``val`` for the remainder.  For
    every image, a ``.txt`` file with the same base name is copied from
    ``SOURCE_LABELS`` to ``DEST_DIR/<split>/labels`` when it exists; images
    without such a file are still copied, and their number is reported at the
    end.

    Args:
        seed: Optional seed for the shuffle.  When given, the same set of
            file names always yields the same split.  ``None`` (the default)
            shuffles with the module-level ``random`` generator.

    Returns:
        None.  Progress and results are printed to stdout.
    """
    if os.path.exists(DEST_DIR):
        print(f"⚠️ Warning: Destination folder already exists: {DEST_DIR}")
        print("   (Ideally, delete it before running this to avoid mixing old data!)")

    print(f"🔍 Scanning images in: {SOURCE_IMAGES}")

    # Match extensions case-insensitively so that e.g. "photo.Jpg" is not
    # silently skipped on case-sensitive filesystems.  Listing the directory
    # (instead of globbing) also keeps a source path containing characters
    # such as "[" from being interpreted as a wildcard pattern.
    image_exts = {'.jpg', '.jpeg', '.png'}
    all_images = []
    if os.path.isdir(SOURCE_IMAGES):
        # Sorted so that a seeded shuffle starts from a stable order.
        for entry in sorted(os.listdir(SOURCE_IMAGES)):
            # Hidden files (leading dot) were never matched by the "*.ext"
            # glob patterns; keep ignoring them.
            if entry.startswith('.'):
                continue
            if os.path.splitext(entry)[1].lower() not in image_exts:
                continue
            candidate = os.path.join(SOURCE_IMAGES, entry)
            # A directory named like an image would make shutil.copy fail.
            if os.path.isfile(candidate):
                all_images.append(candidate)

    total_count = len(all_images)
    if total_count == 0:
        # Bail out before creating any destination folders.
        print(" Error: No images found!")
        return

    if seed is None:
        random.shuffle(all_images)
    else:
        random.Random(seed).shuffle(all_images)

    train_count = int(total_count * TRAIN_RATIO)

    for split in ('train', 'val'):
        os.makedirs(os.path.join(DEST_DIR, split, 'images'), exist_ok=True)
        os.makedirs(os.path.join(DEST_DIR, split, 'labels'), exist_ok=True)

    print(f"Found {total_count} unique images.")
    print(f"   -> Training: {train_count}")
    print(f"   -> Validation: {total_count - train_count}")

    print("Organizing files...")
    missing_labels = 0
    for i, img_path in enumerate(tqdm(all_images)):
        split = 'train' if i < train_count else 'val'

        filename = os.path.basename(img_path)
        label_name = os.path.splitext(filename)[0] + ".txt"

        shutil.copy(img_path, os.path.join(DEST_DIR, split, 'images', filename))

        # The label is optional: an image without a matching .txt is still
        # part of the split, it just has no label file next to it.
        src_txt_path = os.path.join(SOURCE_LABELS, label_name)
        if os.path.exists(src_txt_path):
            shutil.copy(src_txt_path, os.path.join(DEST_DIR, split, 'labels', label_name))
        else:
            missing_labels += 1

    if missing_labels:
        print(f"⚠️ Warning: {missing_labels} image(s) had no label file and were copied without one.")

    print("\nDone! Your dataset is ready at:")
    print(f"   {DEST_DIR}")

# Run the split only when this file is executed as a script, not on import.
if __name__ == "__main__":
    split_dataset()