Spaces:
Sleeping
Sleeping
File size: 2,079 Bytes
52efd90 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 | """
Build nested training subsets for the clean data-scaling study.
Reads filenames from the cleaned dataset (final_data_clean/train/images/) and
writes subset_{25,50,100}.txt with the property:
subset_25.txt ⊆ subset_50.txt ⊆ subset_100.txt
Method: list filenames, sort, shuffle once with seed=42, take the first 25%, 50%,
100%. The same seed used in the original (leaked) run is reused so the only
intentional change is the dataset content.
"""
import random
from pathlib import Path
# Repository root. parents[0] is this file's own directory, so parents[3] is
# three directories above it; the value depends on where this script lives.
REPO_ROOT = Path(__file__).resolve().parents[3]
# Directory whose .jpg file names main() lists to build the subsets.
TRAIN_IMAGES = REPO_ROOT / "final_data_clean" / "train" / "images"
# The subset_*.txt files are written into the directory holding this script.
SUBSETS_DIR = Path(__file__).resolve().parent
# Shuffle seed. Per the module docstring it is deliberately the same seed as
# the original run, so that only the dataset content differs.
SEED = 42
# Subset sizes, as percentages of the full training file list.
SHARES = [25, 50, 100]
def main(train_images=None, subsets_dir=None, seed=None, shares=None):
    """Write nested ``subset_{share}.txt`` lists from one seeded shuffle.

    The ``.jpg`` file names found in the image directory are sorted, shuffled
    once with a seeded ``random.Random``, and the leading ``share`` percent of
    that single order is written out for every share. Each list is a prefix
    of the same order, so a smaller share is contained in every larger one.
    The written files are read back afterwards to confirm that.

    Called without arguments it behaves as the script entry point and uses
    the module-level settings.

    Args:
        train_images: Directory holding the training images. Defaults to the
            module-level ``TRAIN_IMAGES``.
        subsets_dir: Directory that receives the ``subset_*.txt`` files.
            Defaults to the module-level ``SUBSETS_DIR``.
        seed: Seed for the shuffle. Defaults to the module-level ``SEED``.
        shares: Iterable of percentages. Defaults to the module-level
            ``SHARES``.

    Raises:
        FileNotFoundError: If the image directory does not exist.
        RuntimeError: If the image directory contains no ``.jpg`` files.
        AssertionError: If a written subset is not contained in the next
            larger one.
    """
    # Module constants are only looked up when the caller passes nothing.
    images_dir = TRAIN_IMAGES if train_images is None else Path(train_images)
    out_dir = SUBSETS_DIR if subsets_dir is None else Path(subsets_dir)
    seed = SEED if seed is None else seed
    shares = list(SHARES if shares is None else shares)

    if not images_dir.is_dir():
        raise FileNotFoundError(
            f"Cleaned train images dir not found: {images_dir}\n"
            f"Run dedupe_dataset.py first."
        )

    # Sort before shuffling: iterdir() order is not guaranteed, and the seeded
    # shuffle is only reproducible when it starts from a fixed order.
    files = sorted(p.name for p in images_dir.iterdir() if p.suffix == ".jpg")
    n_total = len(files)
    if not files:
        raise RuntimeError(f"No .jpg files found in {images_dir}")

    shuffled = files.copy()
    random.Random(seed).shuffle(shuffled)

    print(f"Total cleaned training images: {n_total}")
    print(f"Seed: {seed}\n")

    for share in shares:
        n = round(n_total * share / 100)
        subset = shuffled[:n]
        out_path = out_dir / f"subset_{share}.txt"
        # One name per line, each newline-terminated. An empty subset yields
        # an empty file instead of a lone "\n", which would read back as a
        # bogus empty file name. UTF-8 keeps the lists platform-independent.
        out_path.write_text(
            "".join(f"{name}\n" for name in subset), encoding="utf-8"
        )
        # ASCII markers only: they print on any console encoding.
        print(f" subset_{share:>3}.txt -> {n:>5} files ({share}%)")

    print("\nNesting check:")
    ordered = sorted(set(shares))
    written = {}
    for share in ordered:
        text = (out_dir / f"subset_{share}.txt").read_text(encoding="utf-8")
        written[share] = {line for line in text.splitlines() if line}

    pairs = list(zip(ordered, ordered[1:]))
    for smaller, larger in pairs:
        # Explicit raise instead of ``assert`` so the check survives -O.
        if not written[smaller] <= written[larger]:
            raise AssertionError(f"{smaller}% is not a subset of {larger}%")
    for smaller, larger in pairs:
        print(
            f" {smaller} subset-of {larger} OK "
            f"({len(written[smaller])} <= {len(written[larger])})"
        )


if __name__ == "__main__":
    main()
|