Mohamed-ENNHIRI
Initial commit: code, metric logs, and report
35839ff
Raw
History Blame Contribute Delete
1.93 kB
"""
Build nested training subsets for the data-scaling study.
Guarantees:
subset_25.txt ⊂ subset_50.txt ⊂ subset_100.txt
Method: list training image filenames, sort, shuffle once with seed=42,
then take the first 25%, 50%, and 100%. Subsets are written as plain text
files (one filename per line) in this directory so both trainers and the
dashboard see the exact same partition.
"""
import os
import random
from pathlib import Path
REPO_ROOT = Path(__file__).resolve().parents[3]
TRAIN_IMAGES = REPO_ROOT / "final_data" / "train" / "images"
SUBSETS_DIR = Path(__file__).resolve().parent
SEED = 42
SHARES = [25, 50, 100]
def main(*, train_images=None, subsets_dir=None, seed=None, shares=None):
    """Write nested training-subset lists and verify that they nest.

    Lists the ``.jpg`` files in the training image directory, sorts the
    names, shuffles them once with a seeded RNG, and writes the first
    ``share`` percent of that shuffled order to ``subset_<share>.txt`` (one
    filename per line). Every subset is a prefix of the same shuffled list,
    so a smaller share is contained in every larger one; the files are
    re-read at the end to confirm that on disk.

    Called with no arguments it uses the module-level ``TRAIN_IMAGES``,
    ``SUBSETS_DIR``, ``SEED`` and ``SHARES``.

    Args:
        train_images: Directory holding the training images; ``None`` means
            ``TRAIN_IMAGES``.
        subsets_dir: Directory the ``subset_<share>.txt`` files are written
            to; ``None`` means ``SUBSETS_DIR``.
        seed: Seed for the shuffle; ``None`` means ``SEED``.
        shares: Iterable of percentages to export; ``None`` means ``SHARES``.

    Raises:
        FileNotFoundError: If the training image directory does not exist.
        RuntimeError: If the directory contains no ``.jpg`` files.
        AssertionError: If a written subset is not contained in the next
            larger one.
    """
    train_images = TRAIN_IMAGES if train_images is None else Path(train_images)
    subsets_dir = SUBSETS_DIR if subsets_dir is None else Path(subsets_dir)
    seed = SEED if seed is None else seed
    shares = list(SHARES if shares is None else shares)

    if not train_images.is_dir():
        raise FileNotFoundError(f"Train images dir not found: {train_images}")

    # Sort before shuffling: iterdir() order is filesystem-dependent, so the
    # sort is what makes the seeded shuffle reproducible across machines.
    files = sorted(p.name for p in train_images.iterdir() if p.suffix == ".jpg")
    n_total = len(files)
    if not files:
        raise RuntimeError(f"No .jpg files found in {train_images}")

    shuffled = files.copy()
    random.Random(seed).shuffle(shuffled)

    print(f"Total training images: {n_total}")
    print(f"Seed: {seed}\n")

    for share in shares:
        # round() is round-half-to-even (e.g. 25% of 10 files -> 2); kept
        # unchanged so existing partitions are reproduced exactly.
        n = int(round(n_total * share / 100))
        out_path = subsets_dir / f"subset_{share}.txt"
        # One terminated line per filename. An empty subset yields an empty
        # file rather than a lone newline, which would read back as a bogus
        # empty filename.
        # NOTE(review): encoding is pinned to UTF-8 so the lists do not depend
        # on the platform locale; confirm the consumers read them as UTF-8.
        out_path.write_text(
            "".join(f"{name}\n" for name in shuffled[:n]), encoding="utf-8"
        )
        print(f" subset_{share:>3}.txt → {n:>5} files ({share}%)")

    print("\nNesting check:")
    # Verify what is actually on disk, for each pair of consecutive shares in
    # ascending order, instead of assuming the 25/50/100 layout.
    ordered = sorted(set(shares))
    members = {
        share: set(
            (subsets_dir / f"subset_{share}.txt")
            .read_text(encoding="utf-8")
            .splitlines()
        )
        for share in ordered
    }
    pairs = list(zip(ordered, ordered[1:]))
    for small, large in pairs:
        # Explicit raise instead of `assert` so the check survives `python -O`.
        if not members[small].issubset(members[large]):
            raise AssertionError(f"{small}% is not a subset of {large}%")
    for small, large in pairs:
        print(
            f" {small} ⊂ {large} ✓ "
            f"({len(members[small])} ⊂ {len(members[large])})"
        )
# Build the subset files only when this file is run as a script, not when it
# is imported.
if __name__ == "__main__":
    main()