"""Build nested training subsets for the data-scaling study.

Guarantees:
    subset_25.txt ⊂ subset_50.txt ⊂ subset_100.txt

Method:
    List training image filenames, sort, shuffle once with seed=42, then
    take the first 25%, 50%, and 100%.

Subsets are written as plain text files (one filename per line) in this
directory so both trainers and the dashboard see the exact same partition.
"""

import os
import random
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parents[3]
TRAIN_IMAGES = REPO_ROOT / "final_data" / "train" / "images"
SUBSETS_DIR = Path(__file__).resolve().parent

SEED = 42
SHARES = [25, 50, 100]

# Subset files are written and read with an explicit encoding so that every
# consumer sees the same bytes regardless of the platform's locale default.
_ENCODING = "utf-8"


def _subset_path(share: int) -> Path:
    """Return the path of the subset file for the given percentage share."""
    return SUBSETS_DIR / f"subset_{share}.txt"


def _subset_size(n_total: int, share: int) -> int:
    """Return how many of ``n_total`` files belong to a ``share``-percent subset.

    Uses the built-in ``round``, which rounds exact .5 ties to the nearest
    even integer; this rule is kept as-is so that previously generated
    partitions stay reproducible.
    """
    return round(n_total * share / 100)


def _read_subset(share: int) -> set:
    """Read a subset file back as a set of filenames."""
    return set(_subset_path(share).read_text(encoding=_ENCODING).splitlines())


def _verify_nesting() -> None:
    """Check that the written subsets are nested in increasing share order.

    Every consecutive pair of shares (smallest to largest) is compared, so the
    check follows ``SHARES`` instead of assuming specific file names.

    Raises:
        AssertionError: if a smaller subset is not contained in the next
            larger one. Raised explicitly (not via ``assert``) so the check
            still runs under ``python -O``.
    """
    ordered = sorted(SHARES)
    contents = {share: _read_subset(share) for share in ordered}
    pairs = list(zip(ordered, ordered[1:]))

    # Validate every pair before reporting anything, so a failure never
    # follows a partial list of success marks.
    for small, large in pairs:
        if not contents[small].issubset(contents[large]):
            raise AssertionError(f"{small}% is not a subset of {large}%")

    for small, large in pairs:
        print(
            f" {small} ⊂ {large} ✓ "
            f"({len(contents[small])} ⊂ {len(contents[large])})"
        )


def main() -> None:
    """Generate the subset files for every share in ``SHARES`` and verify them.

    Raises:
        FileNotFoundError: if the training image directory does not exist.
        RuntimeError: if the directory contains no ``.jpg`` files.
        AssertionError: if the written subsets are not nested.
    """
    if not TRAIN_IMAGES.is_dir():
        raise FileNotFoundError(f"Train images dir not found: {TRAIN_IMAGES}")

    # Sort first so the shuffle input does not depend on directory order.
    files = sorted(p.name for p in TRAIN_IMAGES.iterdir() if p.suffix == ".jpg")
    n_total = len(files)
    if n_total == 0:
        raise RuntimeError(f"No .jpg files found in {TRAIN_IMAGES}")

    # One seeded shuffle; every subset is a prefix of the same ordering,
    # which is what makes the subsets nested.
    shuffled = files.copy()
    random.Random(SEED).shuffle(shuffled)

    print(f"Total training images: {n_total}")
    print(f"Seed: {SEED}\n")

    for share in SHARES:
        n = _subset_size(n_total, share)
        subset = shuffled[:n]
        # One newline-terminated line per filename. An empty subset yields an
        # empty file, which reads back as an empty set rather than {""}.
        _subset_path(share).write_text(
            "".join(f"{name}\n" for name in subset), encoding=_ENCODING
        )
        print(f" subset_{share:>3}.txt → {n:>5} files ({share}%)")

    print("\nNesting check:")
    _verify_nesting()


if __name__ == "__main__":
    main()