"""LIDC-IDRI manifest loading + CT slice access. Manifest source: Chucks90/eryon-data-pipelines, manifests/lidc/manifest_v1.1.0.jsonl. The manifest is label-rich but pixel data (converted axial-slice PNGs) lives in the interim bucket hf://buckets/Chucks90/eryon-datasets, which is NOT readable with the provided token. So `image_root` must point at a local mirror of the slice PNGs to build a real token bank; otherwise the builder reports a data gap (per IMPLEMENTATION_SPEC ยง7). """ from __future__ import annotations import json import os from dataclasses import dataclass from pathlib import Path from huggingface_hub import hf_hub_download @dataclass class SliceRecord: patient_id: str scan_id: str slice_id: str image_path: str split: str has_nodule: bool raw: dict def _hf_token() -> str | None: return os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") def download_manifest(repo_id: str, manifest_path: str, cache_dir: str) -> str: """Fetch the JSONL manifest from the HF dataset repo. Returns a local path.""" return hf_hub_download( repo_id=repo_id, filename=manifest_path, repo_type="dataset", token=_hf_token(), local_dir=cache_dir, ) def iter_manifest(manifest_local_path: str, split: str | None = None): """Yield SliceRecord rows, optionally filtered to a split (e.g. 'train').""" with open(manifest_local_path) as f: for line in f: line = line.strip() if not line: continue rec = json.loads(line) if split is not None and rec.get("split") != split: continue yield SliceRecord( patient_id=rec.get("patient_id", ""), scan_id=rec.get("scan_id", ""), slice_id=rec.get("slice_id", ""), image_path=rec.get("image_path", ""), split=rec.get("split", ""), has_nodule=bool(rec.get("has_nodule", False)), raw=rec, ) def resolve_image(image_root: str | None, image_path: str) -> str | None: """Resolve a manifest image_path to a readable local file, or None if absent.""" if not image_root: return None p = Path(image_root) / image_path return str(p) if p.exists() else None def load_scan_splits(splits_json_path: str) -> dict[str, str]: """Load the LIDC splits file (scan_id -> 'train'|'val'|'test'). Source: Chucks90/eryon-data-pipelines manifests/lidc/splits_v1.0.0.json. This is the patient/scan-level split used to keep the token bank disjoint from eval scans, without needing the 241MB per-slice manifest. """ with open(splits_json_path) as f: return json.load(f)["splits"] def iter_slices_from_tree(image_root: str, scan_splits: dict[str, str], split: str): """Yield (scan_id, png_path) for every slice belonging to scans in `split`. `image_root` is a local mirror of raw/lidc with structure batch_XXXX//slice_NNNN.png. Scans absent from `scan_splits` are skipped (defensively excluded from the held-out bank). """ root = Path(image_root) for batch_dir in sorted(root.glob("batch_*")): if not batch_dir.is_dir(): continue for scan_dir in sorted(batch_dir.iterdir()): if not scan_dir.is_dir(): continue if scan_splits.get(scan_dir.name) != split: continue for png in sorted(scan_dir.glob("slice_*.png")): yield scan_dir.name, str(png)