""" Build nested training subsets for the clean data-scaling study. Reads filenames from the cleaned dataset (final_data_clean/train/images/) and writes subset_{25,50,100}.txt with the property: subset_25.txt ⊂ subset_50.txt ⊂ subset_100.txt Method: list filenames, sort, shuffle once with seed=42, take the first 25%, 50%, 100%. The same seed used in the original (leaked) run is reused so the only intentional change is the dataset content. """ import random from pathlib import Path REPO_ROOT = Path(__file__).resolve().parents[3] TRAIN_IMAGES = REPO_ROOT / "final_data_clean" / "train" / "images" SUBSETS_DIR = Path(__file__).resolve().parent SEED = 42 SHARES = [25, 50, 100] def main(): if not TRAIN_IMAGES.is_dir(): raise FileNotFoundError( f"Cleaned train images dir not found: {TRAIN_IMAGES}\n" f"Run dedupe_dataset.py first." ) files = sorted(p.name for p in TRAIN_IMAGES.iterdir() if p.suffix == ".jpg") n_total = len(files) if n_total == 0: raise RuntimeError(f"No .jpg files found in {TRAIN_IMAGES}") rng = random.Random(SEED) shuffled = files.copy() rng.shuffle(shuffled) print(f"Total cleaned training images: {n_total}") print(f"Seed: {SEED}\n") for share in SHARES: n = int(round(n_total * share / 100)) subset = shuffled[:n] out_path = SUBSETS_DIR / f"subset_{share}.txt" out_path.write_text("\n".join(subset) + "\n") print(f" subset_{share:>3}.txt → {n:>5} files ({share}%)") print("\nNesting check:") s25 = set((SUBSETS_DIR / "subset_25.txt").read_text().splitlines()) s50 = set((SUBSETS_DIR / "subset_50.txt").read_text().splitlines()) s100 = set((SUBSETS_DIR / "subset_100.txt").read_text().splitlines()) assert s25.issubset(s50), "25% is not a subset of 50%" assert s50.issubset(s100), "50% is not a subset of 100%" print(f" 25 ⊂ 50 ✓ ({len(s25)} ⊂ {len(s50)})") print(f" 50 ⊂ 100 ✓ ({len(s50)} ⊂ {len(s100)})") if __name__ == "__main__": main()