# covtoken: label-free lesion-subspace token economy (reframed) + gated eval + paper draft
#
# 3510f1d verified 9 days ago
#
# 3.61 kB

	"""LIDC-IDRI manifest loading + CT slice access.

	Manifest source: Chucks90/eryon-data-pipelines, manifests/lidc/manifest_v1.1.0.jsonl.
	The manifest is label-rich but pixel data (converted axial-slice PNGs) lives in the
	interim bucket hf://buckets/Chucks90/eryon-datasets, which is NOT readable with the
	provided token. So `image_root` must point at a local mirror of the slice PNGs to build
	a real token bank; otherwise the builder reports a data gap (per IMPLEMENTATION_SPEC §7).
	"""
	from __future__ import annotations

	import json
	import os
	from dataclasses import dataclass
	from pathlib import Path

	from huggingface_hub import hf_hub_download


	@dataclass
	class SliceRecord:
	    """One row of the per-slice LIDC manifest.

	    The typed fields mirror the manifest keys of the same name; `raw` keeps the
	    complete decoded JSON object so callers can reach keys that have no
	    dedicated field here.
	    """

	    # Identifiers copied from the manifest row.
	    patient_id: str
	    scan_id: str
	    slice_id: str
	    # Path of the slice image as recorded in the manifest (see resolve_image).
	    image_path: str
	    # Split name the row belongs to, e.g. 'train'.
	    split: str
	    # Whether the manifest flags this slice as containing a nodule.
	    has_nodule: bool
	    # The full manifest row this record was built from.
	    raw: dict


	def _hf_token() -> str | None:
	    """Return the Hugging Face access token from the environment, if any.

	    `HF_TOKEN` is consulted first, then `HUGGING_FACE_HUB_TOKEN`. A variable that
	    is unset or set to the empty string counts as absent, so the result is
	    either a non-empty token or None (never "").
	    """
	    for var_name in ("HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"):
	        value = os.environ.get(var_name)
	        if value:
	            return value
	    # Normalise "nothing usable" to None so callers never forward "" as a token.
	    return None


	def download_manifest(repo_id: str, manifest_path: str, cache_dir: str) -> str:
	    """Download the JSONL manifest from a Hugging Face dataset repo.

	    `manifest_path` is the file name inside `repo_id`; `cache_dir` is handed to
	    `hf_hub_download` as its `local_dir`. The token comes from `_hf_token()`.
	    Returns the local path reported by `hf_hub_download`.
	    """
	    request = {
	        "repo_id": repo_id,
	        "filename": manifest_path,
	        "repo_type": "dataset",
	        "token": _hf_token(),
	        "local_dir": cache_dir,
	    }
	    return hf_hub_download(**request)


	def iter_manifest(manifest_local_path: str, split: str | None = None):
	    """Yield SliceRecord rows, optionally filtered to a split (e.g. 'train').

	    The manifest is read as JSON Lines: one JSON object per line, with blank
	    lines skipped. When `split` is not None, only rows whose "split" value
	    equals it are yielded. Missing string keys default to "" and a missing
	    "has_nodule" to False; the decoded row itself is kept on `raw`.

	    Raises:
	        json.JSONDecodeError: if a non-blank line is not valid JSON.
	    """
	    # JSON text is UTF-8; do not depend on the platform's locale encoding.
	    with open(manifest_local_path, encoding="utf-8") as f:
	        for line in f:
	            line = line.strip()
	            if not line:
	                continue
	            rec = json.loads(line)
	            if split is not None and rec.get("split") != split:
	                continue
	            yield SliceRecord(
	                patient_id=rec.get("patient_id", ""),
	                scan_id=rec.get("scan_id", ""),
	                slice_id=rec.get("slice_id", ""),
	                image_path=rec.get("image_path", ""),
	                split=rec.get("split", ""),
	                has_nodule=bool(rec.get("has_nodule", False)),
	                raw=rec,
	            )


	def resolve_image(image_root: str | None, image_path: str) -> str | None:
	    """Resolve a manifest image_path to a readable local file, or None if absent.

	    `image_path` is joined onto `image_root`. None is returned when either
	    argument is empty/None or when the joined path is not an existing regular
	    file. Directories are rejected: an empty `image_path` would otherwise
	    resolve to `image_root` itself and be reported as an image.

	    NOTE: under pathlib's join rules an absolute `image_path` replaces
	    `image_root` rather than being nested under it.
	    """
	    if not image_root or not image_path:
	        return None
	    candidate = Path(image_root) / image_path
	    # is_file(), not exists(): a directory is never a usable slice image.
	    return str(candidate) if candidate.is_file() else None


	def load_scan_splits(splits_json_path: str) -> dict[str, str]:
	    """Load the LIDC splits file (scan_id -> 'train'|'val'|'test').

	    Source: Chucks90/eryon-data-pipelines manifests/lidc/splits_v1.0.0.json. This is the
	    patient/scan-level split used to keep the token bank disjoint from eval scans, without
	    needing the 241MB per-slice manifest.

	    Returns the value stored under the top-level "splits" key of the JSON document.

	    Raises:
	        KeyError: if the document has no "splits" key.
	        json.JSONDecodeError: if the file is not valid JSON.
	    """
	    # JSON text is UTF-8; do not depend on the platform's locale encoding.
	    with open(splits_json_path, encoding="utf-8") as f:
	        return json.load(f)["splits"]


	def iter_slices_from_tree(image_root: str, scan_splits: dict[str, str], split: str):
	    """Yield (scan_id, png_path) for each slice PNG of the scans assigned to `split`.

	    Expected layout under `image_root` (a local mirror of raw/lidc):
	    batch_XXXX/<scan_id>/slice_NNNN.png. Batches, scans and slices are each
	    visited in sorted order. Non-directory entries are ignored, and a scan
	    whose name is missing from `scan_splits` never matches, so it is skipped
	    (defensively excluded from the held-out bank).
	    """
	    batches = (entry for entry in sorted(Path(image_root).glob("batch_*")) if entry.is_dir())
	    for batch in batches:
	        for scan in sorted(batch.iterdir()):
	            # Only directories named after a scan assigned to the requested split.
	            if scan.is_dir() and scan_splits.get(scan.name) == split:
	                yield from (
	                    (scan.name, str(slice_png))
	                    for slice_png in sorted(scan.glob("slice_*.png"))
	                )