File size: 2,688 Bytes
81f9dfe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
"""
VERIDEX β€” Dataset Split Script (Fixed v2)
==========================================
dataset/train/real + dataset/train/fake nundi
train(70%) / val(15%) / test(15%) ga split chestundi.
"""
import os, shutil, random

# ── CONFIG ──────────────────────────────────────────────────────
REAL_SRC    = "dataset/train/real"
FAKE_SRC    = "dataset/train/fake"
DATASET_DIR = "dataset"

TRAIN_RATIO = 0.70
VAL_RATIO   = 0.15
SEED        = 42
SUPPORTED   = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".tiff"}

# ── HELPERS ──────────────────────────────────────────────────────
def get_images(folder, extensions=None):
    """Return the names of the image files located directly in ``folder``.

    Args:
        folder: Directory to scan. Sub-directories are not descended into.
        extensions: Optional iterable of file extensions including the
            leading dot (e.g. ``{".jpg", ".png"}``); matching is
            case-insensitive. Defaults to the module-level ``SUPPORTED`` set.

    Returns:
        A list of bare file names (no directory part), sorted
        alphabetically. Hidden files (names starting with ``"."``) and
        anything that is not a regular file are excluded.

    Raises:
        OSError: If ``folder`` does not exist or cannot be listed.
    """
    if extensions is None:
        allowed = SUPPORTED
    else:
        allowed = {ext.lower() for ext in extensions}

    with os.scandir(folder) as entries:
        # Sorting matters: directory listing order is arbitrary and differs
        # between filesystems, so without it a seeded shuffle of this list
        # would not be reproducible from one machine to the next.
        return sorted(
            entry.name
            for entry in entries
            # is_file() keeps out directories that merely look like images
            # (e.g. a folder named "x.png"), which could not be copied.
            if entry.is_file()
            and not entry.name.startswith(".")
            and os.path.splitext(entry.name)[1].lower() in allowed
        )

def copy_files(files, src_dir, dst_dir):
    """Copy the named files from ``src_dir`` into ``dst_dir``.

    Args:
        files: Iterable of file names relative to ``src_dir``.
        src_dir: Directory the files are read from.
        dst_dir: Directory the files are written to; created (including
            parents) if it does not exist yet.

    A file whose absolute source and destination paths are identical is
    left alone, so calling this with ``src_dir == dst_dir`` is a no-op
    instead of an error. Copies are made with ``shutil.copy2``.
    """
    os.makedirs(dst_dir, exist_ok=True)

    source_root = os.path.abspath(src_dir)
    target_root = os.path.abspath(dst_dir)

    for name in files:
        origin = os.path.join(source_root, name)
        target = os.path.join(target_root, name)
        # Only copy when the file would actually land somewhere else.
        if origin != target:
            shutil.copy2(origin, target)

# ── MAIN ─────────────────────────────────────────────────────────
def split():
    """Split the real / fake image folders into train, val and test sets.

    For each label the images in its source folder (``REAL_SRC`` /
    ``FAKE_SRC``) are shuffled with ``SEED`` and divided: the first
    ``TRAIN_RATIO`` share goes to ``DATASET_DIR/train/<label>``, the next
    ``VAL_RATIO`` share to ``DATASET_DIR/val/<label>`` and the remainder to
    ``DATASET_DIR/test/<label>``.

    When a source folder *is* the train folder (the default configuration)
    the val and test images are removed from it once they have been copied,
    so that held-out images do not stay in the training set. Because that
    makes the operation non-repeatable, the function refuses to run if the
    val or test folder for such a label already contains images.

    All source folders are checked before any file is touched; on any
    problem an ``ERROR: ...`` line is printed and nothing is changed.

    Returns:
        None. Progress is reported on stdout.
    """
    random.seed(SEED)

    # ── Pass 1: validate everything up front so a failure on the second
    # label cannot leave the first label half-split.
    jobs = []
    for label, src in [("real", REAL_SRC), ("fake", FAKE_SRC)]:
        if not os.path.isdir(src):
            print(f"ERROR: {src} folder not found!")
            return

        train_dst = os.path.join(DATASET_DIR, "train", label)
        val_dst = os.path.join(DATASET_DIR, "val", label)
        test_dst = os.path.join(DATASET_DIR, "test", label)

        # samefile() also catches symlinks / differently spelled paths that
        # a plain string comparison would miss.
        in_place = os.path.isdir(train_dst) and os.path.samefile(src, train_dst)

        if in_place and any(
            os.path.isdir(d) and get_images(d) for d in (val_dst, test_dst)
        ):
            # Splitting in place a second time would carve another 30% out
            # of what is already the train share.
            print(
                f"ERROR: {val_dst} or {test_dst} already contains images; "
                f"{src} looks already split. Clear them (and restore {src}) "
                f"before re-running."
            )
            return

        jobs.append((label, src, train_dst, val_dst, test_dst, in_place))

    # ── Pass 2: do the actual split.
    for label, src, train_dst, val_dst, test_dst, in_place in jobs:
        # Sort first: listing order is filesystem-dependent, and the seeded
        # shuffle is only reproducible when it starts from a fixed order.
        files = sorted(get_images(src))
        random.shuffle(files)
        total = len(files)

        n_train = int(total * TRAIN_RATIO)
        n_val   = int(total * VAL_RATIO)

        train_files = files[:n_train]
        val_files   = files[n_train:n_train + n_val]
        test_files  = files[n_train + n_val:]  # remainder, absorbs rounding

        print(f"\n{label.upper()} ({total:,} images):")
        print(f"  train → {len(train_files):,}  (copying...)")
        if not in_place:
            # When the source is the train folder the files are already
            # where they belong; copying onto themselves would fail.
            copy_files(train_files, src, train_dst)
        print(f"  val   → {len(val_files):,}  (copying...)")
        copy_files(val_files, src, val_dst)
        print(f"  test  → {len(test_files):,}  (copying...)")
        copy_files(test_files, src, test_dst)

        if in_place:
            # The source doubles as the train set, so the held-out files
            # must leave it or train would still contain 100% of the data.
            removed = 0
            for held_out, dst_dir in ((val_files, val_dst), (test_files, test_dst)):
                for name in held_out:
                    src_path = os.path.join(src, name)
                    dst_path = os.path.join(dst_dir, name)
                    # Delete only once a distinct copy verifiably exists.
                    if os.path.isfile(dst_path) and not os.path.samefile(
                        src_path, dst_path
                    ):
                        os.remove(src_path)
                        removed += 1
            print(f"  removed {removed:,} held-out images from {src}")

        print(f"  ✅ {label} done!")

    print("\n🎉 Split complete!")
    print("   Now run: python train_gpu.py")

# Run the split only when this file is executed as a script, not on import.
if __name__ == "__main__":
    split()