Spaces:
Running
Running
| """ | |
| Build nested training subsets for the clean data-scaling study. | |
| Reads filenames from the cleaned dataset (final_data_clean/train/images/) and | |
| writes subset_{25,50,100}.txt with the property: | |
| subset_25.txt ⊂ subset_50.txt ⊂ subset_100.txt | |
| Method: list filenames, sort, shuffle once with seed=42, take the first 25%, 50%, | |
| 100%. The same seed used in the original (leaked) run is reused so the only | |
| intentional change is the dataset content. | |
| """ | |
| import random | |
| from pathlib import Path | |
| REPO_ROOT = Path(__file__).resolve().parents[3] | |
| TRAIN_IMAGES = REPO_ROOT / "final_data_clean" / "train" / "images" | |
| SUBSETS_DIR = Path(__file__).resolve().parent | |
| SEED = 42 | |
| SHARES = [25, 50, 100] | |
def main(*, train_images=None, subsets_dir=None, seed=None, shares=None):
    """Write nested ``subset_{share}.txt`` files from one seeded shuffle.

    The ``.jpg`` filenames in the image directory are sorted, shuffled once
    with a seeded ``random.Random``, and for each share the first
    ``round(n_total * share / 100)`` names are written, one per line, to
    ``subset_{share}.txt``. Because every subset is a prefix of the same
    shuffled list, smaller subsets are contained in larger ones. The files
    are then read back from disk and the containment is verified.

    All parameters are keyword-only and optional; calling ``main()`` uses the
    module-level configuration.

    Args:
        train_images: Directory to list images from (default ``TRAIN_IMAGES``).
        subsets_dir: Directory to write the subset files to
            (default ``SUBSETS_DIR``).
        seed: Seed for the shuffle (default ``SEED``).
        shares: Iterable of percentages to emit (default ``SHARES``).

    Raises:
        FileNotFoundError: The image directory does not exist.
        RuntimeError: The image directory contains no ``.jpg`` files.
        AssertionError: A written subset is not contained in the next
            larger one.
    """
    images_dir = TRAIN_IMAGES if train_images is None else Path(train_images)
    out_dir = SUBSETS_DIR if subsets_dir is None else Path(subsets_dir)
    seed_value = SEED if seed is None else seed
    share_list = list(SHARES if shares is None else shares)

    if not images_dir.is_dir():
        raise FileNotFoundError(
            f"Cleaned train images dir not found: {images_dir}\n"
            f"Run dedupe_dataset.py first."
        )

    # Sort before shuffling so the result does not depend on directory
    # listing order, only on the set of filenames and the seed.
    files = sorted(p.name for p in images_dir.iterdir() if p.suffix == ".jpg")
    n_total = len(files)
    if n_total == 0:
        raise RuntimeError(f"No .jpg files found in {images_dir}")

    shuffled = files.copy()
    random.Random(seed_value).shuffle(shuffled)

    # NOTE(review): the "β" glyphs in the messages below look like
    # mis-encoded symbols (probably an arrow / subset sign / check mark);
    # confirm against the original file before changing them.
    print(f"Total cleaned training images: {n_total}")
    print(f"Seed: {seed_value}\n")

    for share in share_list:
        n = round(n_total * share / 100)
        subset = shuffled[:n]
        out_path = out_dir / f"subset_{share}.txt"
        # One name per line. An empty subset yields an empty file rather than
        # a lone newline, which would read back as a bogus "" entry.
        out_path.write_text(
            "".join(f"{name}\n" for name in subset), encoding="utf-8"
        )
        print(f" subset_{share:>3}.txt β {n:>5} files ({share}%)")

    print("\nNesting check:")
    # Verify what was actually written to disk, for every consecutive pair of
    # the configured shares (not a hard-coded 25/50/100).
    ordered = sorted(set(share_list))
    written = {}
    for share in ordered:
        text = (out_dir / f"subset_{share}.txt").read_text(encoding="utf-8")
        written[share] = {line for line in text.splitlines() if line}

    for smaller, larger in zip(ordered, ordered[1:]):
        inner, outer = written[smaller], written[larger]
        # Explicit raise: a bare ``assert`` is skipped under ``python -O``.
        if not inner.issubset(outer):
            raise AssertionError(f"{smaller}% is not a subset of {larger}%")
        print(f" {smaller} β {larger} β ({len(inner)} β {len(outer)})")
# Build the subset files only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()