Spaces:

phiniqs
/

seg-models

Running

seg-models / experiments /data_scaling_study /subsets /make_subsets.py

Mohamed-ENNHIRI

Initial commit: code, metric logs, and report

35839ff 2 months ago

1.93 kB

	"""
	Build nested training subsets for the data-scaling study.

	Guarantees:
	subset_25.txt ⊂ subset_50.txt ⊂ subset_100.txt

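	Method: list the training image filenames (only ``.jpg`` files), sort them,
	shuffle the sorted list once with seed=42, then take the first 25%, 50%, and
	100% of the shuffled list. Because every subset is a prefix of the same
	shuffled list, the nesting above holds by construction. Subsets are written
	as plain text files (one filename per line) in this directory so both
	trainers and the dashboard see the exact same partition.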
	"""
	import os
	import random
	from pathlib import Path

	# Directory containing this script; the subset_*.txt files are written here.
	SUBSETS_DIR = Path(__file__).resolve().parent
	# Three levels above this directory (i.e. parents[3] of this file).
	REPO_ROOT = SUBSETS_DIR.parents[2]
	# Directory whose .jpg files are listed to build the subsets.
	TRAIN_IMAGES = REPO_ROOT.joinpath("final_data", "train", "images")

	# Seed for the single shuffle of the sorted filename list.
	SEED = 42
	# Percentages of the training set to export, one subset file per entry.
	SHARES = [25, 50, 100]


	def _read_subset(path):
	    """Return the set of filenames listed in *path*, skipping blank lines."""
	    lines = path.read_text(encoding="utf-8").splitlines()
	    return {line for line in lines if line}


	def main():
	    """Write one ``subset_<share>.txt`` per entry in ``SHARES`` and verify nesting.

	    Lists the ``.jpg`` filenames in ``TRAIN_IMAGES``, sorts them, shuffles the
	    sorted list once with ``random.Random(SEED)``, and writes the first
	    ``round(n_total * share / 100)`` names of the shuffled list to
	    ``SUBSETS_DIR / f"subset_{share}.txt"``, one filename per line. The files
	    are then read back and every smaller share is checked to be a subset of
	    the next larger one.

	    Raises:
	        FileNotFoundError: ``TRAIN_IMAGES`` is not a directory.
	        RuntimeError: ``TRAIN_IMAGES`` contains no ``.jpg`` files.
	        AssertionError: a smaller subset is not contained in the next larger one.
	    """
	    if not TRAIN_IMAGES.is_dir():
	        raise FileNotFoundError(f"Train images dir not found: {TRAIN_IMAGES}")

	    # Sort first: iterdir() yields entries in arbitrary order, and the shuffle
	    # is only reproducible if it starts from the same list every time.
	    files = sorted(p.name for p in TRAIN_IMAGES.iterdir() if p.suffix == ".jpg")
	    n_total = len(files)
	    if not files:
	        raise RuntimeError(f"No .jpg files found in {TRAIN_IMAGES}")

	    # A private Random instance keeps the global random state untouched.
	    rng = random.Random(SEED)
	    shuffled = files.copy()
	    rng.shuffle(shuffled)

	    print(f"Total training images: {n_total}")
	    print(f"Seed: {SEED}\n")

	    for share in SHARES:
	        # Every subset is a prefix of the same shuffled list, which is what
	        # makes the subsets nested.
	        n = round(n_total * share / 100)
	        subset = shuffled[:n]
	        out_path = SUBSETS_DIR / f"subset_{share}.txt"
	        # One newline-terminated name per line. An empty subset (possible for
	        # very small datasets, since round() can return 0) becomes an empty
	        # file rather than a lone "\n" that would read back as a blank name.
	        out_path.write_text(
	            "".join(f"{name}\n" for name in subset), encoding="utf-8"
	        )
	        print(f" subset_{share:>3}.txt → {n:>5} files ({share}%)")

	    print("\nNesting check:")
	    # Re-read the files from disk so the check covers what was actually written.
	    ordered = sorted(SHARES)
	    members = {
	        share: _read_subset(SUBSETS_DIR / f"subset_{share}.txt")
	        for share in ordered
	    }
	    pairs = list(zip(ordered, ordered[1:]))
	    # Explicit raise instead of `assert` so the check still runs under -O.
	    for small, large in pairs:
	        if not members[small].issubset(members[large]):
	            raise AssertionError(f"{small}% is not a subset of {large}%")
	    for small, large in pairs:
	        print(
	            f" {small} ⊂ {large} ✓ "
	            f"({len(members[small])} ⊂ {len(members[large])})"
	        )


	# Script entry point: build the subsets only when this file is executed
	# directly, not when it is imported.
	if __name__ == "__main__":
	main()