Spaces:

phiniqs
/

seg-models

Running

seg-models / experiments /clean_data_scaling_study /subsets /make_subsets.py

Mohamed-ENNHIRI

Solar Panel Segmentation app for HF Spaces

52efd90 14 days ago

2.08 kB

	"""
	Build nested training subsets for the clean data-scaling study.

	Reads filenames from the cleaned dataset (final_data_clean/train/images/) and
	writes subset_{25,50,100}.txt with the property:

	subset_25.txt ⊂ subset_50.txt ⊂ subset_100.txt

	Method: list the .jpg filenames, sort them (directory listing order is not
	guaranteed), shuffle once with seed=42, then take the first 25%, 50%, and
	100% of the shuffled list. The seed from the original (leaked) run is reused
	so that the only intentional change is the dataset content.
	"""
	import random
	from pathlib import Path

	# Repository root: parents[3] of this file's resolved path, i.e. three
	# levels above the directory that contains this script.
	REPO_ROOT = Path(__file__).resolve().parents[3]
	# Directory whose .jpg filenames are sampled into the subsets.
	TRAIN_IMAGES = REPO_ROOT / "final_data_clean" / "train" / "images"
	# The subset_*.txt files are written next to this script.
	SUBSETS_DIR = Path(__file__).resolve().parent

	# Seed for the single shuffle; per the module docstring it matches the
	# original run so that only the dataset content differs.
	SEED = 42
	# Subset sizes, as percentages of the full cleaned training set.
	SHARES = [25, 50, 100]


	def main():
	    """Write the nested ``subset_{share}.txt`` lists and verify their nesting.

	    Lists the ``.jpg`` filenames in ``TRAIN_IMAGES``, sorts them, shuffles a
	    copy once with ``random.Random(SEED)``, and for every percentage in
	    ``SHARES`` writes the first ``round(n_total * share / 100)`` names to
	    ``SUBSETS_DIR / "subset_{share}.txt"``, one name per line. Each file is
	    then read back and every subset is checked to be contained in the next
	    larger one.

	    Raises:
	        FileNotFoundError: if ``TRAIN_IMAGES`` is not an existing directory.
	        RuntimeError: if the directory contains no ``.jpg`` files.
	        AssertionError: if a written subset is not contained in the next
	            larger one.
	    """
	    if not TRAIN_IMAGES.is_dir():
	        raise FileNotFoundError(
	            f"Cleaned train images dir not found: {TRAIN_IMAGES}\n"
	            f"Run dedupe_dataset.py first."
	        )

	    # Sort first: directory iteration order is arbitrary, and the seeded
	    # shuffle is only reproducible when it starts from a fixed order.
	    files = sorted(p.name for p in TRAIN_IMAGES.iterdir() if p.suffix == ".jpg")
	    n_total = len(files)
	    if n_total == 0:
	        raise RuntimeError(f"No .jpg files found in {TRAIN_IMAGES}")

	    rng = random.Random(SEED)
	    shuffled = files.copy()
	    rng.shuffle(shuffled)

	    print(f"Total cleaned training images: {n_total}")
	    print(f"Seed: {SEED}\n")

	    for share in SHARES:
	        n = int(round(n_total * share / 100))
	        # Every subset is a prefix of the same shuffled list, which is what
	        # makes the smaller subsets nest inside the larger ones.
	        subset = shuffled[:n]
	        out_path = SUBSETS_DIR / f"subset_{share}.txt"
	        # Explicit encoding so the files do not depend on the locale.
	        out_path.write_text("\n".join(subset) + "\n", encoding="utf-8")
	        print(f" subset_{share:>3}.txt → {n:>5} files ({share}%)")

	    print("\nNesting check:")
	    # Derive the pairs from SHARES instead of hard-coding 25/50/100, so the
	    # check keeps matching the files written above if SHARES is edited.
	    ordered = sorted(set(SHARES))
	    members = {}
	    for share in ordered:
	        text = (SUBSETS_DIR / f"subset_{share}.txt").read_text(encoding="utf-8")
	        # Drop blank lines: an empty subset is stored as a lone newline and
	        # must read back as an empty set, not as {""}.
	        members[share] = {line for line in text.splitlines() if line}

	    pairs = list(zip(ordered, ordered[1:]))
	    for small, large in pairs:
	        # Raise explicitly rather than `assert`, which `python -O` strips.
	        if not members[small].issubset(members[large]):
	            raise AssertionError(f"{small}% is not a subset of {large}%")
	    for small, large in pairs:
	        print(
	            f" {small} ⊂ {large} ✓ "
	            f"({len(members[small])} ⊂ {len(members[large])})"
	        )


	if __name__ == "__main__":
	    main()