Campus-AI / scripts /count_splits.py
realruneett's picture
Final Release: CampusGen AI Pipeline & Compositor
a8aea21
import os
from pathlib import Path
# Config
data_root = Path("data")
train_dir = data_root / "train"
val_dir = data_root / "val"
test_dir = data_root / "test"
IMG_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".bmp"}
def count_images_in_dir(d: Path) -> int:
if not d.exists():
return 0
return len([f for f in os.listdir(d) if Path(f).suffix.lower() in IMG_EXTENSIONS])
# Find all categories from processed dir (source of truth)
processed_dir = data_root / "processed"
categories = set()
if processed_dir.exists():
for root, dirs, files in os.walk(processed_dir):
if any(Path(f).suffix.lower() in IMG_EXTENSIONS for f in files):
rel = Path(root).relative_to(processed_dir)
categories.add(str(rel).replace("\\", "/"))
else:
# Fallback: finding categories from splits directly
for d in [train_dir, val_dir, test_dir]:
if d.exists():
for root, dirs, files in os.walk(d):
if any(Path(f).suffix.lower() in IMG_EXTENSIONS for f in files):
rel = Path(root).relative_to(d)
categories.add(str(rel).replace("\\", "/"))
print(f"{'Category':<40} | {'Train':<6} | {'Val':<5} | {'Test':<5} | {'Total':<6} | {'% Train':<8}")
print("-" * 100)
grand_totals = {"train": 0, "val": 0, "test": 0, "total": 0}
for cat in sorted(list(categories)):
c_train = count_images_in_dir(train_dir / cat)
c_val = count_images_in_dir(val_dir / cat)
c_test = count_images_in_dir(test_dir / cat)
total = c_train + c_val + c_test
grand_totals["train"] += c_train
grand_totals["val"] += c_val
grand_totals["test"] += c_test
grand_totals["total"] += total
pct_train = (c_train / total * 100) if total > 0 else 0.0
print(f"{cat:<40} | {c_train:<6} | {c_val:<5} | {c_test:<5} | {total:<6} | {pct_train:.1f}%")
print("-" * 100)
t_train = grand_totals['train']
t_total = grand_totals['total']
t_pct = (t_train / t_total * 100) if t_total > 0 else 0
print(f"{'TOTAL':<40} | {t_train:<6} | {grand_totals['val']:<5} | {grand_totals['test']:<5} | {t_total:<6} | {t_pct:.1f}%")