"""Load ISLES24 data from local directory or HuggingFace Hub."""
from __future__ import annotations
import re
import shutil
import tempfile
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING, Protocol, Self

from stroke_deepisles_demo.core.logging import get_logger
from stroke_deepisles_demo.core.types import CaseFiles # noqa: TC001
from stroke_deepisles_demo.data.isles24_manifest import (
ISLES24_DATASET_ID,
ISLES24_DATASET_REVISION,
ISLES24_TRAIN_CASE_IDS,
isles24_train_data_file,
)

# Security: Regex for valid ISLES24 subject IDs (defense-in-depth)
# Expected format: sub-strokeXXXX (e.g., sub-stroke0001)
_SAFE_SUBJECT_ID_PATTERN = re.compile(r"^sub-stroke\d{4}$")
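# Deterministic examples of the pattern's behavior:
#   _SAFE_SUBJECT_ID_PATTERN.match("sub-stroke0001")        -> matches
#   _SAFE_SUBJECT_ID_PATTERN.match("sub-stroke01")          -> None (too few digits)
#   _SAFE_SUBJECT_ID_PATTERN.match("../../etc/sub-stroke")  -> None (traversal rejected)
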
if TYPE_CHECKING:
    from datasets import Dataset as HFDataset

logger = get_logger(__name__)


class Dataset(Protocol):
"""Protocol for dataset access.
All dataset implementations support context manager usage for proper cleanup:
with load_isles_dataset() as ds:
case = ds.get_case(0)
# ... process case ...
# cleanup happens automatically
"""
def __len__(self) -> int: ...
def __enter__(self) -> Self: ...
def __exit__(self, *args: object) -> None: ...
def list_case_ids(self) -> list[str]: ...
def get_case(self, case_id: str | int) -> CaseFiles: ...
def cleanup(self) -> None: ...
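

# Illustrative sketch (not part of this module's API): any helper written against
# the `Dataset` protocol works with the local adapter and both HF-backed
# implementations below; `first_case` is a hypothetical name.
#
#     def first_case(ds: Dataset) -> CaseFiles:
#         with ds:  # guarantees temp-file cleanup on exit
#             return ds.get_case(ds.list_case_ids()[0])

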
@dataclass
class DatasetInfo:
"""Metadata about the dataset."""
source: str # "local" or HF dataset ID
num_cases: int
modalities: list[str]
has_ground_truth: bool
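

# Purely illustrative construction (field values are made up):
#   DatasetInfo(source="local", num_cases=10, modalities=["dwi", "adc"], has_ground_truth=True)

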
@dataclass
class HuggingFaceDatasetWrapper:
"""Wrapper for HuggingFace dataset to match the Dataset protocol.
Uses the standard datasets library (with neuroimaging-go-brrrr patched Nifti feature)
to load data. Materializes NIfTI images to temporary files on demand.
"""
dataset: HFDataset
dataset_id: str
_temp_dir: Path | None = field(default=None, repr=False)
_case_id_to_index: dict[str, int] = field(default_factory=dict, repr=False)
def __post_init__(self) -> None:
"""Build index of subject IDs for O(1) lookup."""
try:
# Efficiently build index from 'subject_id' column
self._case_id_to_index = {
sid: idx for idx, sid in enumerate(self.dataset["subject_id"])
}
        except (KeyError, TypeError, ValueError) as e:
            logger.warning(
                "Failed to build index from subject_id column: %s. Falling back to iteration.", e
            )
            # Slow path: decode rows one by one; only reached if the column read fails.
            for idx, item in enumerate(self.dataset):
                self._case_id_to_index[item["subject_id"]] = idx
def __len__(self) -> int:
return len(self.dataset)
def __enter__(self) -> Self:
return self
def __exit__(self, *args: object) -> None:
self.cleanup()
def list_case_ids(self) -> list[str]:
return sorted(self._case_id_to_index.keys())
def get_case(self, case_id: str | int) -> CaseFiles:
"""Get files for a case by ID or index.
Materializes NIfTI objects to temporary files.
"""
# Resolve case_id to index
if isinstance(case_id, int):
if case_id < 0 or case_id >= len(self.dataset):
raise IndexError(f"Case index {case_id} out of range")
idx = case_id
else:
if case_id not in self._case_id_to_index:
raise KeyError(f"Case ID {case_id} not found")
idx = self._case_id_to_index[case_id]
row = self.dataset[idx]
subject_id = row["subject_id"]
# Security: Validate subject_id before using in path (defense-in-depth)
if not _SAFE_SUBJECT_ID_PATTERN.match(subject_id):
raise ValueError(
f"Invalid subject_id format: {subject_id!r}. Expected format: sub-strokeXXXX"
)
# Prepare temp dir
if self._temp_dir is None:
self._temp_dir = Path(tempfile.mkdtemp(prefix="isles24_hf_wrapper_"))
case_dir = self._temp_dir / subject_id
case_dir.mkdir(exist_ok=True)
dwi_path = case_dir / f"{subject_id}_dwi.nii.gz"
adc_path = case_dir / f"{subject_id}_adc.nii.gz"
# Materialize files if they don't exist
if not dwi_path.exists():
row["dwi"].to_filename(str(dwi_path))
if not adc_path.exists():
row["adc"].to_filename(str(adc_path))
case_files: CaseFiles = {
"dwi": dwi_path,
"adc": adc_path,
}
# Handle lesion mask (mapped to ground_truth)
if "lesion_mask" in row and row["lesion_mask"] is not None:
mask_path = case_dir / f"{subject_id}_lesion-msk.nii.gz"
if not mask_path.exists():
row["lesion_mask"].to_filename(str(mask_path))
case_files["ground_truth"] = mask_path
return case_files
def cleanup(self) -> None:
if self._temp_dir and self._temp_dir.exists():
try:
shutil.rmtree(self._temp_dir)
except OSError as e:
logger.warning("Failed to cleanup temp directory %s: %s", self._temp_dir, e)
self._temp_dir = None
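

# Illustrative usage (mirrors how load_isles_dataset() builds this wrapper below;
# "org/some-mirror" is a hypothetical dataset ID, and network access plus any
# required HF token are assumed):
#
#     from datasets import load_dataset
#
#     cols = ["subject_id", "dwi", "adc", "lesion_mask"]
#     ds = load_dataset("org/some-mirror", split="train").select_columns(cols)
#     with HuggingFaceDatasetWrapper(ds, "org/some-mirror") as wrapper:
#         files = wrapper.get_case("sub-stroke0001")

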
@dataclass
class Isles24HuggingFaceDataset:
"""ISLES24 dataset access optimized for HF Spaces.
Key behavior:
- `list_case_ids()` returns from a pinned manifest (no dataset download).
- `get_case()` loads exactly one Parquet shard via `data_files=...` (no 27GB eager download).
This class exists because `datasets.load_dataset(dataset_id, split="train")` can
trigger an eager full-dataset download/prepare on cold starts, which is not viable
for API endpoints like `/api/cases` on Hugging Face Spaces.
"""
dataset_id: str = ISLES24_DATASET_ID
token: str | None = None
revision: str = ISLES24_DATASET_REVISION
_temp_dir: Path | None = field(default=None, repr=False)
def __len__(self) -> int:
return len(ISLES24_TRAIN_CASE_IDS)
def __enter__(self) -> Self:
return self
def __exit__(self, *args: object) -> None:
self.cleanup()
def list_case_ids(self) -> list[str]:
return list(ISLES24_TRAIN_CASE_IDS)
def get_case(self, case_id: str | int) -> CaseFiles:
"""Load files for a single ISLES24 case.
Args:
case_id: Case identifier (e.g., "sub-stroke0102") or 0-based integer index.
"""
from datasets import load_dataset
if isinstance(case_id, int):
if case_id < 0 or case_id >= len(ISLES24_TRAIN_CASE_IDS):
raise IndexError(f"Case index {case_id} out of range")
resolved_case_id = ISLES24_TRAIN_CASE_IDS[case_id]
else:
resolved_case_id = case_id
# Security: Validate subject_id before using in path (defense-in-depth)
if not _SAFE_SUBJECT_ID_PATTERN.match(resolved_case_id):
raise ValueError(
f"Invalid subject_id format: {resolved_case_id!r}. Expected format: sub-strokeXXXX"
)
# Load exactly one shard (1 case per parquet file in this dataset)
data_file = isles24_train_data_file(resolved_case_id)
ds = load_dataset(
self.dataset_id,
data_files={"train": data_file},
split="train",
token=self.token,
revision=self.revision,
)
ds = ds.select_columns(["subject_id", "dwi", "adc", "lesion_mask"])
if len(ds) != 1:
raise RuntimeError(f"Expected 1 row for {resolved_case_id}, got {len(ds)}")
row = ds[0]
subject_id = row["subject_id"]
if subject_id != resolved_case_id:
raise RuntimeError(
f"Unexpected subject_id {subject_id!r} in {data_file} (expected {resolved_case_id!r})"
)
if self._temp_dir is None:
self._temp_dir = Path(tempfile.mkdtemp(prefix="isles24_hf_wrapper_"))
case_dir = self._temp_dir / subject_id
case_dir.mkdir(exist_ok=True)
dwi_path = case_dir / f"{subject_id}_dwi.nii.gz"
adc_path = case_dir / f"{subject_id}_adc.nii.gz"
if not dwi_path.exists():
row["dwi"].to_filename(str(dwi_path))
if not adc_path.exists():
row["adc"].to_filename(str(adc_path))
case_files: CaseFiles = {
"dwi": dwi_path,
"adc": adc_path,
}
if row.get("lesion_mask") is not None:
mask_path = case_dir / f"{subject_id}_lesion-msk.nii.gz"
if not mask_path.exists():
row["lesion_mask"].to_filename(str(mask_path))
case_files["ground_truth"] = mask_path
return case_files
def cleanup(self) -> None:
if self._temp_dir and self._temp_dir.exists():
try:
shutil.rmtree(self._temp_dir)
except OSError as e:
logger.warning("Failed to cleanup temp directory %s: %s", self._temp_dir, e)
self._temp_dir = None
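

# Illustrative usage: list_case_ids() reads only the pinned manifest, so it is cheap
# enough for request handlers like /api/cases; downloading happens only in get_case(),
# one Parquet shard at a time.
#
#     ds = Isles24HuggingFaceDataset()
#     ids = ds.list_case_ids()         # manifest-only; no dataset download
#     with ds:
#         files = ds.get_case(ids[0])  # fetches exactly one shard

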
def load_isles_dataset(
source: str | Path | None = None,
*,
local_mode: bool | None = None,
token: str | None = None,
) -> Dataset:
"""
Load ISLES24 dataset from local directory or HuggingFace Hub.
Args:
source: Local directory path or HuggingFace dataset ID.
If None, uses Settings.hf_dataset_id from config.
local_mode: If True, treat source as local directory.
If None, auto-detect based on source type.
token: HuggingFace token for private/gated datasets.
If None, uses Settings.hf_token from config.
Returns:
Dataset-like object providing case access. Use as context manager
for automatic cleanup of temp files (important for HuggingFace mode).
Examples:
# Load from HuggingFace with automatic cleanup (recommended)
with load_isles_dataset() as ds:
case = ds.get_case(0)
# Load from local directory
ds = load_isles_dataset("data/isles24", local_mode=True)
# Load specific HuggingFace dataset with token
ds = load_isles_dataset("org/private-dataset", token="hf_xxx")
"""
# Auto-detect mode if not specified
if local_mode is None:
if source is None:
local_mode = False # Default to HuggingFace
elif isinstance(source, Path):
local_mode = True
else:
# String: check if it's an existing local path
# Only select local mode if the path itself exists
# (avoids misclassifying HF dataset IDs like "org/dataset")
source_path = Path(source)
local_mode = source_path.exists()
if local_mode:
from stroke_deepisles_demo.data.adapter import build_local_dataset
if source is None:
source = "data/isles24"
return build_local_dataset(Path(source))
# HuggingFace mode
from datasets import load_dataset
from stroke_deepisles_demo.core.config import get_settings
settings = get_settings()
# Use settings defaults if not specified
dataset_id = str(source) if source else settings.hf_dataset_id
hf_token = token if token is not None else settings.hf_token
if dataset_id == ISLES24_DATASET_ID:
return Isles24HuggingFaceDataset(dataset_id=dataset_id, token=hf_token)
# Load dataset, selecting only necessary columns to minimize decoding overhead
# We rely on neuroimaging-go-brrrr's Nifti feature for lazy loading if configured,
# but select_columns ensures we don't touch other modalities.
# Token enables access to private/gated datasets
ds = load_dataset(dataset_id, split="train", token=hf_token)
ds = ds.select_columns(["subject_id", "dwi", "adc", "lesion_mask"])
return HuggingFaceDatasetWrapper(ds, dataset_id)
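

if __name__ == "__main__":
    # Minimal manual smoke test (illustrative; assumes network access and any HF
    # credentials required by the configured dataset). Listing case IDs goes through
    # the pinned manifest, so it should not trigger a full dataset download.
    with load_isles_dataset() as _ds:
        _ids = _ds.list_case_ids()
        logger.info("Dataset exposes %d case(s); first: %s", len(_ids), _ids[0] if _ids else None)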