"""
Build nested training subsets for the data-scaling study.

Guarantees:
  subset_25.txt  ⊂  subset_50.txt  ⊂  subset_100.txt

Method: list training image filenames, sort, shuffle once with seed=42,
then take the first 25%, 50%, and 100%. Subsets are written as plain text
files (one filename per line) in this directory so both trainers and the
dashboard see the exact same partition.
"""
import os
import random
from pathlib import Path

# Repository root: three directory levels above the directory that holds this
# script. The hard-coded depth ties this value to the script's location in the
# tree, so it must be updated if the file is moved.
REPO_ROOT = Path(__file__).resolve().parents[3]
# Directory whose .jpg files form the full (100%) training set.
TRAIN_IMAGES = REPO_ROOT / "final_data" / "train" / "images"
# Subset files are written next to this script.
SUBSETS_DIR = Path(__file__).resolve().parent

# Seed for the single shuffle; changing it changes every subset.
SEED = 42
# Percentages of the training set; one subset_<share>.txt is written per entry.
SHARES = [25, 50, 100]


def main(train_images=None, subsets_dir=None, seed=None, shares=None):
    """Write nested ``subset_<share>.txt`` files and verify that they nest.

    The ``.jpg`` filenames in *train_images* are sorted, shuffled once with
    *seed*, and the first ``share`` percent of that single shuffled order is
    written for every entry of *shares*. Because every subset is a prefix of
    the same order, a smaller share is always contained in a larger one.
    The files are then read back from disk and the containment is checked.

    Args:
        train_images: Directory holding the training images. Defaults to
            the module-level ``TRAIN_IMAGES``.
        subsets_dir: Existing directory that receives the subset files.
            Defaults to the module-level ``SUBSETS_DIR``.
        seed: Seed for the shuffle. Defaults to the module-level ``SEED``.
        shares: Iterable of percentages, one subset file per entry.
            Defaults to the module-level ``SHARES``.

    Raises:
        FileNotFoundError: If *train_images* is not a directory.
        RuntimeError: If *train_images* contains no ``.jpg`` files.
        AssertionError: If the files read back from disk are not nested.
    """
    train_images = TRAIN_IMAGES if train_images is None else Path(train_images)
    subsets_dir = SUBSETS_DIR if subsets_dir is None else Path(subsets_dir)
    seed = SEED if seed is None else seed
    # Ascending order, so consecutive entries are the pairs that must nest.
    shares = sorted(SHARES if shares is None else shares)

    if not train_images.is_dir():
        raise FileNotFoundError(f"Train images dir not found: {train_images}")

    # Sort before shuffling: iterdir() order is arbitrary, and the shuffle is
    # only reproducible when it starts from the same order every time.
    files = sorted(p.name for p in train_images.iterdir() if p.suffix == ".jpg")
    n_total = len(files)
    if n_total == 0:
        raise RuntimeError(f"No .jpg files found in {train_images}")

    rng = random.Random(seed)
    shuffled = files.copy()
    rng.shuffle(shuffled)

    print(f"Total training images: {n_total}")
    print(f"Seed: {seed}\n")

    for share in shares:
        # round() rounds halves to the nearest even integer; kept as-is so
        # that existing partitions stay reproducible.
        n = round(n_total * share / 100)
        subset = shuffled[:n]
        out_path = subsets_dir / f"subset_{share}.txt"
        # One terminated line per filename. An empty subset yields an empty
        # file rather than a lone blank line, which readers would otherwise
        # parse as an empty filename.
        out_path.write_text(
            "".join(f"{name}\n" for name in subset), encoding="utf-8"
        )
        print(f"  subset_{share:>3}.txt  →  {n:>5} files  ({share}%)")

    print("\nNesting check:")
    # Verify what is actually on disk, not the in-memory lists.
    on_disk = {
        share: set(
            (subsets_dir / f"subset_{share}.txt")
            .read_text(encoding="utf-8")
            .splitlines()
        )
        for share in shares
    }
    pairs = list(zip(shares, shares[1:]))
    for smaller, larger in pairs:
        # Explicit raise instead of `assert`, so the check survives `python -O`.
        if not on_disk[smaller].issubset(on_disk[larger]):
            raise AssertionError(f"{smaller}% is not a subset of {larger}%")
    for smaller, larger in pairs:
        print(
            f"  {smaller} ⊂ {larger:<3} ✓ "
            f"({len(on_disk[smaller])} ⊂ {len(on_disk[larger])})"
        )

# Run only when executed as a script, so importing this module does not
# rewrite the subset files as a side effect.
if __name__ == "__main__":
    main()