covtoken / data /loaders.py
Chucks90's picture
covtoken: label-free lesion-subspace token economy (reframed) + gated eval + paper draft
3510f1d verified
Raw
History Blame Contribute Delete
3.61 kB
"""LIDC-IDRI manifest loading + CT slice access.
Manifest source: Chucks90/eryon-data-pipelines, manifests/lidc/manifest_v1.1.0.jsonl.
The manifest is label-rich but pixel data (converted axial-slice PNGs) lives in the
interim bucket hf://buckets/Chucks90/eryon-datasets, which is NOT readable with the
provided token. So `image_root` must point at a local mirror of the slice PNGs to build
a real token bank; otherwise the builder reports a data gap (per IMPLEMENTATION_SPEC §7).
"""
from __future__ import annotations
import json
import os
from dataclasses import dataclass
from pathlib import Path
from huggingface_hub import hf_hub_download
@dataclass
class SliceRecord:
    """One row of the LIDC manifest, as produced by ``iter_manifest``.

    The typed fields are convenience copies of the same-named manifest keys;
    ``raw`` keeps the complete decoded JSON object so callers can reach keys
    that are not surfaced here.
    """

    # Identifiers copied from the manifest row ("" when the key is missing).
    patient_id: str
    scan_id: str
    slice_id: str
    # Path as recorded in the manifest; `resolve_image` joins it onto an image root.
    image_path: str
    # Split name stored on the row (e.g. 'train').
    split: str
    # Truthiness of the row's "has_nodule" value.
    has_nodule: bool
    # The full decoded manifest row.
    raw: dict
def _hf_token() -> str | None:
return os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
def download_manifest(repo_id: str, manifest_path: str, cache_dir: str) -> str:
    """Download the JSONL manifest from a Hugging Face dataset repo.

    Args:
        repo_id: Dataset repository to fetch from.
        manifest_path: Path of the manifest file inside that repository.
        cache_dir: Forwarded to ``hf_hub_download`` as ``local_dir``.

    Returns:
        The local path reported by ``hf_hub_download``.
    """
    request = {
        "repo_id": repo_id,
        "filename": manifest_path,
        "repo_type": "dataset",
        # Token comes from the environment; None means an anonymous request.
        "token": _hf_token(),
        "local_dir": cache_dir,
    }
    return hf_hub_download(**request)
def iter_manifest(manifest_local_path: str, split: str | None = None):
    """Yield SliceRecord rows, optionally filtered to a split (e.g. 'train').

    The manifest is read as UTF-8 JSON Lines: one JSON object per line, with
    blank / whitespace-only lines ignored.

    Args:
        manifest_local_path: Path to the local ``.jsonl`` manifest.
        split: When not ``None``, only rows whose ``"split"`` value equals
            this string are yielded.

    Yields:
        One ``SliceRecord`` per retained row. Missing string keys default to
        ``""`` and a missing ``"has_nodule"`` to ``False``; the full decoded
        row is kept in ``raw``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # text encoding, which is not UTF-8 everywhere.
    with open(manifest_local_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rec = json.loads(line)
            if split is not None and rec.get("split") != split:
                continue
            yield SliceRecord(
                patient_id=rec.get("patient_id", ""),
                scan_id=rec.get("scan_id", ""),
                slice_id=rec.get("slice_id", ""),
                image_path=rec.get("image_path", ""),
                split=rec.get("split", ""),
                # NOTE(review): plain truthiness — a string such as "false"
                # would count as True; confirm the manifest stores real booleans.
                has_nodule=bool(rec.get("has_nodule", False)),
                raw=rec,
            )
def resolve_image(image_root: str | None, image_path: str) -> str | None:
    """Resolve a manifest image_path to a readable local file, or None if absent.

    Args:
        image_root: Directory holding the local image mirror; ``None`` or
            ``""`` means no mirror is available.
        image_path: Path recorded in the manifest, joined onto ``image_root``.

    Returns:
        The joined path as a string when it names an existing regular file,
        otherwise ``None``.
    """
    # An empty image_path would join to image_root itself (a directory), so
    # treat it as "no image" instead of handing back the root.
    if not image_root or not image_path:
        return None
    candidate = Path(image_root) / image_path
    # is_file() rather than exists(): a directory is not a readable image.
    return str(candidate) if candidate.is_file() else None
def load_scan_splits(splits_json_path: str) -> dict[str, str]:
    """Load the LIDC splits file (scan_id -> 'train'|'val'|'test').

    Source: Chucks90/eryon-data-pipelines manifests/lidc/splits_v1.0.0.json. This is the
    patient/scan-level split used to keep the token bank disjoint from eval scans, without
    needing the 241MB per-slice manifest.

    Args:
        splits_json_path: Path to the local splits JSON file.

    Returns:
        The value stored under the file's top-level ``"splits"`` key.

    Raises:
        KeyError: If the JSON document has no ``"splits"`` key.
    """
    # Read as UTF-8 explicitly rather than with the platform default encoding.
    with open(splits_json_path, encoding="utf-8") as f:
        return json.load(f)["splits"]
def iter_slices_from_tree(image_root: str, scan_splits: dict[str, str], split: str):
    """Walk a local slice mirror and yield (scan_id, png_path) pairs for one split.

    The expected layout under ``image_root`` is
    ``batch_XXXX/<scan_id>/slice_NNNN.png``. Batch directories, scan
    directories and slice files are each visited in sorted order. A scan
    directory contributes slices only when ``scan_splits`` maps its name to
    ``split``; scans missing from ``scan_splits`` are therefore left out.

    Args:
        image_root: Root directory of the local mirror.
        scan_splits: Mapping of scan directory name to split name.
        split: Split whose slices should be yielded.

    Yields:
        ``(scan_id, png_path)`` with ``png_path`` as a string.
    """
    batch_dirs = sorted(Path(image_root).glob("batch_*"))
    for batch in batch_dirs:
        if batch.is_dir():
            for scan in sorted(batch.iterdir()):
                scan_id = scan.name
                # Only real directories whose scan id is assigned to `split`.
                if scan.is_dir() and scan_splits.get(scan_id) == split:
                    for slice_png in sorted(scan.glob("slice_*.png")):
                        yield scan_id, str(slice_png)