| """LIDC-IDRI manifest loading + CT slice access. | |
| Manifest source: Chucks90/eryon-data-pipelines, manifests/lidc/manifest_v1.1.0.jsonl. | |
| The manifest is label-rich but pixel data (converted axial-slice PNGs) lives in the | |
| interim bucket hf://buckets/Chucks90/eryon-datasets, which is NOT readable with the | |
| provided token. So `image_root` must point at a local mirror of the slice PNGs to build | |
| a real token bank; otherwise the builder reports a data gap (per IMPLEMENTATION_SPEC §7). | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import os | |
| from dataclasses import dataclass | |
| from pathlib import Path | |
| from huggingface_hub import hf_hub_download | |
@dataclass
class SliceRecord:
    """One row of the LIDC per-slice manifest.

    Instances are built by ``iter_manifest`` using keyword arguments, so the
    class must be a dataclass to get a generated ``__init__`` (a plain class
    with bare annotations would raise ``TypeError`` on construction).
    """

    # Identifier fields copied from the manifest row ("" when the key is absent).
    patient_id: str
    scan_id: str
    slice_id: str
    # Path of the slice image as recorded in the manifest; resolved against a
    # local image root by ``resolve_image``.
    image_path: str
    # Split label stored in the manifest row (e.g. 'train').
    split: str
    # Truthiness of the row's "has_nodule" value.
    has_nodule: bool
    # The complete parsed JSON object for the row, kept for fields not
    # promoted to attributes.
    raw: dict
def _hf_token() -> str | None:
    """Return the Hugging Face access token found in the environment.

    ``HF_TOKEN`` wins when it is set to a non-empty value; otherwise the value
    of ``HUGGING_FACE_HUB_TOKEN`` is returned as-is (``None`` when unset).
    """
    primary = os.environ.get("HF_TOKEN")
    if primary:
        return primary
    # An unset or empty HF_TOKEN falls through to the legacy variable name.
    return os.environ.get("HUGGING_FACE_HUB_TOKEN")
def download_manifest(repo_id: str, manifest_path: str, cache_dir: str) -> str:
    """Download the JSONL manifest from a HF dataset repo and return its local path.

    Args:
        repo_id: Hugging Face repository id holding the manifest.
        manifest_path: Path of the manifest file inside the repository.
        cache_dir: Local directory handed to ``hf_hub_download`` as ``local_dir``.

    Returns:
        The path returned by ``hf_hub_download``.
    """
    request = {
        "repo_id": repo_id,
        "filename": manifest_path,
        "repo_type": "dataset",
        # Token is read from the environment; may be None.
        "token": _hf_token(),
        "local_dir": cache_dir,
    }
    return hf_hub_download(**request)
def iter_manifest(manifest_local_path: str, split: str | None = None):
    """Yield a SliceRecord for each row of a JSONL manifest.

    Blank lines are skipped. Each remaining line is parsed as one JSON object.

    Args:
        manifest_local_path: Path to the local JSONL manifest file.
        split: When not None, only rows whose "split" value equals this are
            yielded (e.g. 'train'); rows without a "split" key are dropped.

    Yields:
        SliceRecord instances. Missing string fields default to "", a missing
        "has_nodule" defaults to False, and ``raw`` holds the full parsed row.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8; decode explicitly instead of relying on the
    # platform's locale encoding, which differs across systems.
    with open(manifest_local_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rec = json.loads(line)
            if split is not None and rec.get("split") != split:
                continue
            yield SliceRecord(
                patient_id=rec.get("patient_id", ""),
                scan_id=rec.get("scan_id", ""),
                slice_id=rec.get("slice_id", ""),
                image_path=rec.get("image_path", ""),
                split=rec.get("split", ""),
                # Coerce to a real bool so non-bool truthy values normalise.
                has_nodule=bool(rec.get("has_nodule", False)),
                raw=rec,
            )
def resolve_image(image_root: str | None, image_path: str) -> str | None:
    """Resolve a manifest image_path to a readable local file, or None if absent.

    Args:
        image_root: Directory holding the local mirror of the slice images;
            None or "" means no mirror is available.
        image_path: Image path taken from a manifest row.

    Returns:
        ``str(Path(image_root) / image_path)`` when that path is an existing
        regular file, otherwise None.
    """
    # An empty image_path (the default for rows lacking the key) would resolve
    # to image_root itself, so treat it as "no image" up front.
    if not image_root or not image_path:
        return None
    candidate = Path(image_root) / image_path
    # is_file() rather than exists(): a directory is not a readable image.
    return str(candidate) if candidate.is_file() else None
def load_scan_splits(splits_json_path: str) -> dict[str, str]:
    """Load the LIDC splits file (scan_id -> 'train'|'val'|'test').

    Source: Chucks90/eryon-data-pipelines manifests/lidc/splits_v1.0.0.json. This is the
    patient/scan-level split used to keep the token bank disjoint from eval scans, without
    needing the 241MB per-slice manifest.

    Args:
        splits_json_path: Path to the local splits JSON file.

    Returns:
        The value stored under the top-level "splits" key.

    Raises:
        KeyError: If the JSON object has no "splits" key.
    """
    # JSON text is UTF-8; decode explicitly instead of relying on the
    # platform's locale encoding.
    with open(splits_json_path, encoding="utf-8") as f:
        return json.load(f)["splits"]
def iter_slices_from_tree(image_root: str, scan_splits: dict[str, str], split: str):
    """Walk a local slice mirror and yield (scan_id, png_path) pairs for one split.

    The mirror is expected to be laid out as
    ``batch_XXXX/<scan_id>/slice_NNNN.png`` under `image_root`. Batch
    directories, scan directories and slice files are each visited in sorted
    order. A scan is emitted only when `scan_splits` maps its directory name to
    `split`; scans missing from `scan_splits` are skipped, which keeps unknown
    scans out of the held-out bank.
    """
    for batch in sorted(Path(image_root).glob("batch_*")):
        if batch.is_dir():
            for scan in sorted(batch.iterdir()):
                scan_id = scan.name
                # Only directories whose name is assigned to the requested split.
                if scan.is_dir() and scan_splits.get(scan_id) == split:
                    for slice_png in sorted(scan.glob("slice_*.png")):
                        yield scan_id, str(slice_png)