File size: 2,079 Bytes
52efd90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
"""
Build nested training subsets for the clean data-scaling study.

Reads filenames from the cleaned dataset (final_data_clean/train/images/) and
writes subset_{25,50,100}.txt with the property:

    subset_25.txt  ⊂  subset_50.txt  ⊂  subset_100.txt

Method: list filenames, sort, shuffle once with seed=42, take the first 25%, 50%,
100%. The same seed used in the original (leaked) run is reused so the only
intentional change is the dataset content.
"""
import random
from pathlib import Path

# Repository root, taken as the fourth ancestor of this file (three levels
# above the directory that contains it).
REPO_ROOT = Path(__file__).resolve().parents[3]
# Directory of cleaned training images whose filenames are listed and sampled.
TRAIN_IMAGES = REPO_ROOT / "final_data_clean" / "train" / "images"
# The subset_*.txt files are written into the directory containing this script.
SUBSETS_DIR = Path(__file__).resolve().parent

# Seed for the single shuffle that defines every subset.
SEED = 42
# Percentages of the shuffled list to export; one subset_<share>.txt per entry.
SHARES = [25, 50, 100]


def main(train_images=None, subsets_dir=None, seed=None, shares=None):
    """Write nested training-subset lists and verify that they nest.

    The ``.jpg`` filenames in *train_images* are sorted, shuffled once with
    *seed*, and for every percentage in *shares* the leading slice of that
    shuffled list is written to ``subset_<share>.txt`` in *subsets_dir* (one
    filename per line, UTF-8). Because every subset is a prefix of the same
    shuffled list, a smaller share is always contained in a larger one; the
    files are read back afterwards to confirm this on disk.

    Args:
        train_images: Directory to list. Defaults to ``TRAIN_IMAGES``.
        subsets_dir: Existing directory to write into. Defaults to
            ``SUBSETS_DIR``.
        seed: Seed for the shuffle. Defaults to ``SEED``.
        shares: Iterable of percentages to export. Defaults to ``SHARES``.

    Raises:
        FileNotFoundError: *train_images* is not a directory.
        RuntimeError: *train_images* contains no ``.jpg`` file.
        AssertionError: The written subsets are not nested.
    """
    # Fall back to the module constants only for arguments that were omitted.
    train_images = TRAIN_IMAGES if train_images is None else Path(train_images)
    subsets_dir = SUBSETS_DIR if subsets_dir is None else Path(subsets_dir)
    seed = SEED if seed is None else seed
    shares = list(SHARES if shares is None else shares)

    if not train_images.is_dir():
        raise FileNotFoundError(
            f"Cleaned train images dir not found: {train_images}\n"
            f"Run dedupe_dataset.py first."
        )

    # The suffix match is case-sensitive: only names ending in ".jpg" count.
    # Sorting first makes the shuffle independent of directory listing order.
    files = sorted(p.name for p in train_images.iterdir() if p.suffix == ".jpg")
    n_total = len(files)
    if n_total == 0:
        raise RuntimeError(f"No .jpg files found in {train_images}")

    rng = random.Random(seed)
    shuffled = files.copy()
    rng.shuffle(shuffled)

    print(f"Total cleaned training images: {n_total}")
    print(f"Seed: {seed}\n")

    for share in shares:
        n = round(n_total * share / 100)
        subset = shuffled[:n]
        out_path = subsets_dir / f"subset_{share}.txt"
        # One terminated line per name. An empty subset yields an empty file
        # rather than a lone newline, which would read back as a bogus ""
        # entry and break the nesting check below.
        out_path.write_text(
            "".join(f"{name}\n" for name in subset), encoding="utf-8"
        )
        print(f"  subset_{share:>3}.txt  →  {n:>5} files  ({share}%)")

    print("\nNesting check:")
    # Re-read what was actually written so the check covers the files on disk.
    on_disk = {
        share: set(
            (subsets_dir / f"subset_{share}.txt")
            .read_text(encoding="utf-8")
            .splitlines()
        )
        for share in shares
    }
    ordered = sorted(on_disk)
    pairs = list(zip(ordered, ordered[1:]))
    # Explicit raise instead of `assert` so the check survives `python -O`.
    for lo, hi in pairs:
        if not on_disk[lo].issubset(on_disk[hi]):
            raise AssertionError(f"{lo}% is not a subset of {hi}%")
    for lo, hi in pairs:
        print(f"  {lo} ⊂ {hi:<3} ✓ ({len(on_disk[lo])} ⊂ {len(on_disk[hi])})")

# Generate the subset files only when run as a script, not when imported.
if __name__ == "__main__":
    main()