VERIDEX.V1 / trainer /split_dataset.py
shadow55gh
fix: remove node_modules and cache from tracking
81f9dfe
"""
VERIDEX — Dataset Split Script (Fixed v2)
==========================================
Takes the images in dataset/train/real and dataset/train/fake and
splits them into train (70%) / val (15%) / test (15%) sets.
"""
import os, shutil, random
# ── CONFIG ──────────────────────────────────────────────────────
# Source folders that hold the images to be split, one per label.
REAL_SRC = "dataset/train/real"
FAKE_SRC = "dataset/train/fake"
# Root folder under which the train/, val/ and test/ outputs are created.
DATASET_DIR = "dataset"
# Share of each label's images assigned to train and to val; the images
# left over after both shares are taken go to test.
TRAIN_RATIO = 0.70
VAL_RATIO = 0.15
# Seed for the shuffle applied before slicing.
SEED = 42
# Lower-case file extensions that get_images() accepts as images.
SUPPORTED = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".tiff"}
# ── HELPERS ──────────────────────────────────────────────────────
def get_images(folder):
    """Return the names of the image files directly inside ``folder``.

    A name is kept when it does not start with a dot, its extension
    (compared case-insensitively) is in ``SUPPORTED``, and it refers to a
    file rather than a directory. The scan is not recursive.

    Args:
        folder: Path of the directory to scan.

    Returns:
        A sorted list of bare file names (no directory component).
    """
    # os.listdir() returns entries in arbitrary order, so sort them: a
    # seeded shuffle of this list is then the same on every machine.
    return sorted(
        name
        for name in os.listdir(folder)
        if not name.startswith(".")
        and os.path.splitext(name)[1].lower() in SUPPORTED
        # A directory named e.g. "crops.png" must not be handed to a file copy.
        and os.path.isfile(os.path.join(folder, name))
    )
def copy_files(files, src_dir, dst_dir):
    """Copy the named files from ``src_dir`` into ``dst_dir``.

    The destination directory is created if it does not exist yet. A file
    whose absolute source and destination paths are identical is left
    alone instead of being copied onto itself.

    Args:
        files: Iterable of file names relative to ``src_dir``.
        src_dir: Directory the files are read from.
        dst_dir: Directory the files are written to.
    """
    os.makedirs(dst_dir, exist_ok=True)
    source_root = os.path.abspath(src_dir)
    target_root = os.path.abspath(dst_dir)
    for name in files:
        origin = os.path.join(source_root, name)
        target = os.path.join(target_root, name)
        # Copying a file onto itself would fail, so only copy distinct paths.
        if origin != target:
            shutil.copy2(origin, target)
# ── MAIN ─────────────────────────────────────────────────────────
def _hold_out(files, src_dir, dst_dir, remove_source):
    """Copy ``files`` into ``dst_dir`` and optionally delete the originals."""
    copy_files(files, src_dir, dst_dir)
    if remove_source:
        # Only reached once every copy above has succeeded.
        for name in files:
            os.remove(os.path.join(src_dir, name))


def split():
    """Split the real and fake image pools into train / val / test folders.

    For each label, the images in its source folder are sorted, shuffled
    with a generator seeded by ``SEED`` and sliced: the first
    ``TRAIN_RATIO`` share is train, the next ``VAL_RATIO`` share is val and
    whatever remains is test. Each slice ends up in
    ``DATASET_DIR/<split>/<label>``.

    When a source folder is itself the train destination (the case with the
    default paths), val and test files are removed from it after being
    copied, so no held-out image stays in the training set. In that mode a
    label whose val or test folder already holds images is skipped, because
    splitting the remaining train files again would shrink the train set.

    Returns:
        None. Progress is printed; processing stops at the first missing
        source folder.
    """
    # A private generator yields the same sequence as seeding the module
    # level one, without altering global random state for other code.
    rng = random.Random(SEED)

    for label, src in [("real", REAL_SRC), ("fake", FAKE_SRC)]:
        if not os.path.isdir(src):
            print(f"ERROR: {src} folder not found!")
            return

        train_dir = os.path.join(DATASET_DIR, "train", label)
        val_dir = os.path.join(DATASET_DIR, "val", label)
        test_dir = os.path.join(DATASET_DIR, "test", label)

        # Splitting in place: the train copy is a no-op, so held-out files
        # have to leave the source folder or they leak into training.
        in_place = os.path.abspath(src) == os.path.abspath(train_dir)
        if in_place and any(
            os.path.isdir(d) and get_images(d) for d in (val_dir, test_dir)
        ):
            print(f"\n{label.upper()}: val/test already contain images, "
                  "skipping (delete them to split again).")
            continue

        # Directory listing order is arbitrary; sorting first makes the
        # seeded shuffle give the same split on every run and machine.
        files = sorted(get_images(src))
        rng.shuffle(files)

        total = len(files)
        n_train = int(total * TRAIN_RATIO)
        n_val = int(total * VAL_RATIO)
        train_files = files[:n_train]
        val_files = files[n_train:n_train + n_val]
        # Test takes the remainder, so rounding never drops an image.
        test_files = files[n_train + n_val:]

        print(f"\n{label.upper()} ({total:,} images):")
        print(f" train → {len(train_files):,} (copying...)")
        copy_files(train_files, src, train_dir)
        print(f" val → {len(val_files):,} (copying...)")
        _hold_out(val_files, src, val_dir, in_place)
        print(f" test → {len(test_files):,} (copying...)")
        _hold_out(test_files, src, test_dir, in_place)
        print(f" ✅ {label} done!")

    print("\n🎉 Split complete!")
    print(" Now run: python train_gpu.py")
# Run the split only when this file is executed as a script, not on import.
if __name__ == "__main__":
    split()