| # phase 1: data access / hf integration | |
| ## purpose | |
| Implement the data loading layer that consumes ISLES24-MR-Lite from HuggingFace Hub. At the end of this phase, we can load any case by ID and get local paths to DWI, ADC, and ground truth NIfTI files. | |
| ## deliverables | |
| - [ ] `src/stroke_deepisles_demo/data/loader.py` - HF dataset loading | |
| - [ ] `src/stroke_deepisles_demo/data/adapter.py` - Case adapter for file access | |
| - [ ] `src/stroke_deepisles_demo/data/staging.py` - Stage files for DeepISLES | |
| - [ ] Unit tests with fixtures (no network required) | |
| - [ ] Integration test (marked, requires network) | |
| ## vertical slice outcome | |
| After this phase, you can run: | |
| ```python | |
from stroke_deepisles_demo.data import get_case, list_case_ids

# Enumerate every case available in the dataset.
case_ids = list_case_ids()
print(f"Found {len(case_ids)} cases")

# Load a specific case. CaseFiles is a TypedDict, so its fields are read
# by key (case["dwi"]), not by attribute (case.dwi).
case = get_case("sub-001")
print(f"DWI: {case['dwi']}")
print(f"ADC: {case['adc']}")
print(f"Ground truth: {case['ground_truth']}")
| ``` | |
| ## module structure | |
| ``` | |
| src/stroke_deepisles_demo/data/ | |
| ├── __init__.py # Public API exports | |
| ├── loader.py # HF Hub dataset loading | |
| ├── adapter.py # Case adapter (index → files) | |
| └── staging.py # Stage files with DeepISLES naming | |
| ``` | |
| ## interfaces and types | |
| ### `data/loader.py` | |
| ```python | |
| """Load ISLES24-MR-Lite dataset from HuggingFace Hub.""" | |
| from __future__ import annotations | |
# `dataclass` is required by the @dataclass-decorated DatasetInfo below.
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Imported for annotations only, so `datasets` is not loaded at runtime here.
    from datasets import Dataset
def load_isles_dataset(
    dataset_id: str = "YongchengYAO/ISLES24-MR-Lite",
    *,
    cache_dir: Path | None = None,
    streaming: bool = False,
) -> Dataset:
    """
    Fetch the ISLES24-MR-Lite dataset from the HuggingFace Hub.

    Args:
        dataset_id: Hub identifier of the dataset to load.
        cache_dir: Directory used for the local cache; ``None`` keeps the
            HuggingFace default location.
        streaming: When ``True``, load lazily in streaming mode.

    Returns:
        HuggingFace Dataset object with BIDS/NIfTI support.

    Raises:
        DataLoadError: If the dataset cannot be loaded.
    """
    ...
def get_dataset_info(dataset_id: str = "YongchengYAO/ISLES24-MR-Lite") -> DatasetInfo:
    """
    Describe the dataset without downloading it.

    Args:
        dataset_id: Hub identifier of the dataset to inspect.

    Returns:
        DatasetInfo carrying the case count, the available modalities, etc.
    """
    ...
@dataclass
class DatasetInfo:
    """Summary metadata describing the loaded dataset."""

    # HuggingFace Hub identifier the metadata refers to.
    dataset_id: str
    # Number of cases in the dataset.
    num_cases: int
    # Modality names, e.g., ["dwi", "adc", "mask"].
    modalities: list[str]
    # Whether ground-truth data is available.
    has_ground_truth: bool
| ``` | |
| ### `data/adapter.py` | |
| ```python | |
| """Adapt HF dataset rows to typed file references.""" | |
| from __future__ import annotations | |
from pathlib import Path
from typing import TYPE_CHECKING, Iterator

from stroke_deepisles_demo.core.types import CaseFiles

if TYPE_CHECKING:
    # `Dataset` appears only in annotations (CaseAdapter.__init__); importing it
    # under TYPE_CHECKING lets type checkers resolve the name.
    from datasets import Dataset
class CaseAdapter:
    """
    Typed view over a HuggingFace dataset of stroke cases.

    Translates between the HF dataset structure and the internal
    CaseFiles type so callers never touch raw dataset rows.
    """

    def __init__(self, dataset: Dataset) -> None:
        """
        Wrap an already-loaded dataset.

        Args:
            dataset: HuggingFace Dataset with NIfTI files
        """
        ...

    def __len__(self) -> int:
        """Number of cases the dataset holds."""
        ...

    def __iter__(self) -> Iterator[str]:
        """Yield case IDs one by one."""
        ...

    def list_case_ids(self) -> list[str]:
        """
        Enumerate every available case identifier.

        Returns:
            List of case IDs (e.g., ["sub-001", "sub-002", ...])
        """
        ...

    def get_case(self, case_id: str | int) -> CaseFiles:
        """
        Look up the file paths belonging to one case.

        Args:
            case_id: Either a string ID (e.g., "sub-001") or integer index

        Returns:
            CaseFiles with paths to DWI, ADC, and optionally ground truth

        Raises:
            KeyError: If case_id not found
            DataLoadError: If files cannot be accessed
        """
        ...

    def get_case_by_index(self, index: int) -> tuple[str, CaseFiles]:
        """
        Look up a case by its numerical position.

        Args:
            index: Numerical index of the case.

        Returns:
            Tuple of (case_id, CaseFiles)
        """
        ...
| ``` | |
| ### `data/staging.py` | |
| ```python | |
| """Stage NIfTI files with DeepISLES-expected naming.""" | |
| from __future__ import annotations | |
| from pathlib import Path | |
| from typing import NamedTuple | |
| from stroke_deepisles_demo.core.types import CaseFiles | |
class StagedCase(NamedTuple):
    """Locations of the staged files that DeepISLES will consume."""

    # Directory containing staged files
    input_dir: Path
    # Path to dwi.nii.gz
    dwi_path: Path
    # Path to adc.nii.gz
    adc_path: Path
    # Path to flair.nii.gz if available
    flair_path: Path | None
def stage_case_for_deepisles(
    case_files: CaseFiles,
    output_dir: Path,
    *,
    case_id: str | None = None,
) -> StagedCase:
    """
    Stage case files with DeepISLES-expected naming convention.

    DeepISLES expects files named exactly:
    - dwi.nii.gz
    - adc.nii.gz
    - flair.nii.gz (optional)

    The staged files must be physical files on the host disk, because Docker
    mounts the staging directory as a volume. Sources that are already local
    paths are copied; sources obtained in streaming mode (URLs or lazy file
    objects rather than local POSIX paths) must be downloaded/materialized
    into the staging directory first.

    Args:
        case_files: Source file references from CaseAdapter
        output_dir: Directory to stage files into
        case_id: Optional case ID for logging/subdirectory

    Returns:
        StagedCase with paths to staged files

    Raises:
        MissingInputError: If required files (DWI, ADC) are missing
        OSError: If file operations fail
    """
    ...
def create_staging_directory(base_dir: Path | None = None) -> Path:
    """
    Make a fresh temporary directory to stage files into.

    Args:
        base_dir: Parent directory (uses system temp if None)

    Returns:
        Path to created staging directory
    """
    ...
| ``` | |
| ### `data/__init__.py` (public API) | |
| ```python | |
| """Data loading and case management for stroke-deepisles-demo.""" | |
from __future__ import annotations

# CaseFiles is referenced by the get_case() return annotation in this module.
from stroke_deepisles_demo.core.types import CaseFiles
from stroke_deepisles_demo.data.adapter import CaseAdapter
from stroke_deepisles_demo.data.loader import DatasetInfo, get_dataset_info, load_isles_dataset
from stroke_deepisles_demo.data.staging import StagedCase, stage_case_for_deepisles
# Names exported as the public API of the data package.
__all__ = [
    # Loader
    "load_isles_dataset",
    "get_dataset_info",
    "DatasetInfo",
    # Adapter
    "CaseAdapter",
    # Staging
    "stage_case_for_deepisles",
    "StagedCase",
    # Convenience functions
    "get_case",
    "list_case_ids",
]
# Convenience functions (combine loader + adapter)
def get_case(case_id: str | int) -> CaseFiles:
    """Return one case, addressed either by its ID or by its index."""
    ...
def list_case_ids() -> list[str]:
    """Return the IDs of every available case."""
    ...
| ``` | |
| ## tdd plan | |
| ### test file structure | |
| ``` | |
| tests/ | |
| ├── conftest.py # Shared fixtures | |
| ├── data/ | |
| │   ├── __init__.py | |
| │   ├── test_loader.py # Tests for HF loading | |
| │   ├── test_adapter.py # Tests for case adapter | |
| │   └── test_staging.py # Tests for file staging | |
| └── fixtures/ | |
|     └── nifti/ # Minimal synthetic NIfTI files | |
|         ├── dwi.nii.gz | |
|         ├── adc.nii.gz | |
|         └── mask.nii.gz | |
| ``` | |
| ### tests to write first (TDD order) | |
| #### 1. `tests/conftest.py` - Fixtures | |
| ```python | |
| """Shared test fixtures.""" | |
| from __future__ import annotations | |
import tempfile
from pathlib import Path
from typing import Iterator

import nibabel as nib
import numpy as np
import pytest

from stroke_deepisles_demo.core.types import CaseFiles

# Fixed seed so the synthetic volumes are identical on every test run.
_SEED = 0


def _write_nifti(data: np.ndarray, path: Path) -> Path:
    """Save *data* as a NIfTI image with an identity affine and return *path*."""
    nib.save(nib.Nifti1Image(data, affine=np.eye(4)), path)
    return path


@pytest.fixture
def temp_dir() -> Iterator[Path]:
    """Create a temporary directory for test outputs.

    This is a generator fixture: the directory is removed once the test
    that requested it has finished.
    """
    with tempfile.TemporaryDirectory() as td:
        yield Path(td)


@pytest.fixture
def synthetic_nifti_3d(temp_dir: Path) -> Path:
    """Create a minimal synthetic 3D NIfTI file."""
    rng = np.random.default_rng(_SEED)
    data = rng.random((10, 10, 10), dtype=np.float32)
    return _write_nifti(data, temp_dir / "synthetic.nii.gz")


@pytest.fixture
def synthetic_case_files(temp_dir: Path) -> CaseFiles:
    """Create a complete set of synthetic case files (DWI, ADC, mask; no FLAIR)."""
    # Keep the sources in their own sub-directory: tests stage into temp_dir
    # using the same file names (dwi.nii.gz, adc.nii.gz), and staging a file
    # onto itself would fail.
    source_dir = temp_dir / "source"
    source_dir.mkdir()

    rng = np.random.default_rng(_SEED)
    shape = (64, 64, 30)

    # Create DWI
    dwi_path = _write_nifti(rng.random(shape, dtype=np.float32), source_dir / "dwi.nii.gz")

    # Create ADC (scaled by 2000, as in the original fixture)
    adc_path = _write_nifti(
        rng.random(shape, dtype=np.float32) * 2000, source_dir / "adc.nii.gz"
    )

    # Create mask (roughly 10% of voxels set)
    mask_data = (rng.random(shape) > 0.9).astype(np.uint8)
    mask_path = _write_nifti(mask_data, source_dir / "mask.nii.gz")

    return CaseFiles(
        dwi=dwi_path,
        adc=adc_path,
        flair=None,
        ground_truth=mask_path,
    )


@pytest.fixture
def mock_hf_dataset(synthetic_case_files: CaseFiles):
    """Create a mock HF Dataset-like object."""
    # Returns a simple dict-based mock that mimics Dataset behavior
    ...
| ``` | |
| #### 2. `tests/data/test_staging.py` - Start with staging (no network) | |
| ```python | |
| """Tests for data staging module.""" | |
| from __future__ import annotations | |
| from pathlib import Path | |
| import pytest | |
| from stroke_deepisles_demo.core.exceptions import MissingInputError | |
| from stroke_deepisles_demo.core.types import CaseFiles | |
| from stroke_deepisles_demo.data.staging import ( | |
| StagedCase, | |
| create_staging_directory, | |
| stage_case_for_deepisles, | |
| ) | |
class TestCreateStagingDirectory:
    """Tests for create_staging_directory."""

    def test_creates_directory(self, temp_dir: Path) -> None:
        """Staging directory is created inside the requested base directory."""
        staging = create_staging_directory(base_dir=temp_dir)
        assert staging.exists()
        assert staging.is_dir()
        # resolve() so that symlinked temp roots compare equal.
        assert temp_dir.resolve() in staging.resolve().parents

    def test_uses_system_temp_when_no_base(self) -> None:
        """Uses system temp directory when base_dir is None."""
        import shutil
        import tempfile

        staging = create_staging_directory(base_dir=None)
        try:
            assert staging.is_dir()
            # Verify the location, not just the existence, of the directory.
            system_tmp = Path(tempfile.gettempdir()).resolve()
            assert system_tmp in staging.resolve().parents
        finally:
            # Cleanup runs even when an assertion above fails.
            shutil.rmtree(staging, ignore_errors=True)
class TestStageCaseForDeepIsles:
    """Tests for stage_case_for_deepisles.

    Every test stages into ``temp_dir / "staged"`` rather than ``temp_dir``
    itself: the source fixtures may live directly in ``temp_dir`` under the
    very names DeepISLES expects (dwi.nii.gz, adc.nii.gz), and copying a file
    onto itself raises ``shutil.SameFileError``.
    """

    def test_stages_required_files(
        self, synthetic_case_files: CaseFiles, temp_dir: Path
    ) -> None:
        """DWI and ADC are staged with correct names."""
        staged = stage_case_for_deepisles(synthetic_case_files, temp_dir / "staged")
        assert staged.dwi_path.name == "dwi.nii.gz"
        assert staged.adc_path.name == "adc.nii.gz"
        assert staged.dwi_path.exists()
        assert staged.adc_path.exists()

    def test_staged_files_are_readable(
        self, synthetic_case_files: CaseFiles, temp_dir: Path
    ) -> None:
        """Staged files can be read as valid NIfTI."""
        import nibabel as nib

        staged = stage_case_for_deepisles(synthetic_case_files, temp_dir / "staged")
        dwi = nib.load(staged.dwi_path)
        assert dwi.shape == (64, 64, 30)

    def test_raises_when_dwi_missing(self, temp_dir: Path) -> None:
        """Raises MissingInputError when DWI is missing."""
        # Make the ADC exist so that the DWI is the only missing input and the
        # error can only be about the DWI.
        adc_path = temp_dir / "adc.nii.gz"
        adc_path.touch()
        case_files = CaseFiles(
            dwi=temp_dir / "nonexistent.nii.gz",
            adc=adc_path,
            flair=None,
            ground_truth=None,
        )
        with pytest.raises(MissingInputError, match="DWI"):
            stage_case_for_deepisles(case_files, temp_dir / "staged")

    def test_flair_is_optional(
        self, synthetic_case_files: CaseFiles, temp_dir: Path
    ) -> None:
        """Staging succeeds when FLAIR is None."""
        # synthetic_case_files has flair=None
        staged = stage_case_for_deepisles(synthetic_case_files, temp_dir / "staged")
        assert staged.flair_path is None
| ``` | |
| #### 3. `tests/data/test_adapter.py` - Case adapter with mocks | |
| ```python | |
| """Tests for case adapter module.""" | |
| from __future__ import annotations | |
| import pytest | |
| from stroke_deepisles_demo.core.types import CaseFiles | |
| from stroke_deepisles_demo.data.adapter import CaseAdapter | |
class TestCaseAdapter:
    """Behavioural checks for CaseAdapter backed by a mock dataset."""

    def test_list_case_ids_returns_strings(self, mock_hf_dataset) -> None:
        """list_case_ids yields a list whose items are all strings."""
        ids = CaseAdapter(mock_hf_dataset).list_case_ids()
        assert isinstance(ids, list)
        assert all(isinstance(item, str) for item in ids)

    def test_len_matches_dataset_size(self, mock_hf_dataset) -> None:
        """The adapter reports as many cases as the dataset has."""
        wrapped = CaseAdapter(mock_hf_dataset)
        assert len(wrapped) == len(mock_hf_dataset)

    def test_get_case_by_string_id(self, mock_hf_dataset) -> None:
        """A case can be fetched through its string identifier."""
        wrapped = CaseAdapter(mock_hf_dataset)
        first_id = wrapped.list_case_ids()[0]
        files = wrapped.get_case(first_id)
        assert isinstance(files, dict)  # CaseFiles is a TypedDict
        assert "dwi" in files
        assert "adc" in files

    def test_get_case_by_index(self, mock_hf_dataset) -> None:
        """A case can be fetched through its integer position."""
        wrapped = CaseAdapter(mock_hf_dataset)
        identifier, files = wrapped.get_case_by_index(0)
        assert isinstance(identifier, str)
        assert files["dwi"] is not None

    def test_get_case_invalid_id_raises(self, mock_hf_dataset) -> None:
        """An unknown case ID results in KeyError."""
        wrapped = CaseAdapter(mock_hf_dataset)
        with pytest.raises(KeyError):
            wrapped.get_case("nonexistent-case-id")

    def test_iteration(self, mock_hf_dataset) -> None:
        """Iterating the adapter produces one ID per case."""
        wrapped = CaseAdapter(mock_hf_dataset)
        collected = list(wrapped)
        assert len(collected) == len(wrapped)
| ``` | |
| #### 4. `tests/data/test_loader.py` - Loader with network mocks | |
| ```python | |
| """Tests for data loader module.""" | |
| from __future__ import annotations | |
| from unittest.mock import MagicMock, patch | |
| import pytest | |
| from stroke_deepisles_demo.core.exceptions import DataLoadError | |
| from stroke_deepisles_demo.data.loader import ( | |
| DatasetInfo, | |
| get_dataset_info, | |
| load_isles_dataset, | |
| ) | |
class TestLoadIslesDataset:
    """Unit tests for load_isles_dataset with the HF loader patched out."""

    def test_calls_hf_load_dataset(self) -> None:
        """The dataset ID is forwarded to datasets.load_dataset exactly once."""
        with patch("stroke_deepisles_demo.data.loader.load_dataset") as hf_load:
            hf_load.return_value = MagicMock()
            load_isles_dataset("test/dataset")
            hf_load.assert_called_once()
            first_positional = hf_load.call_args.args[0]
            assert first_positional == "test/dataset"

    def test_returns_dataset_object(self) -> None:
        """Whatever load_dataset returns is handed back unchanged."""
        with patch("stroke_deepisles_demo.data.loader.load_dataset") as hf_load:
            sentinel = MagicMock()
            hf_load.return_value = sentinel
            assert load_isles_dataset() is sentinel

    def test_handles_load_error(self) -> None:
        """Failures raised by HF surface as DataLoadError."""
        with patch(
            "stroke_deepisles_demo.data.loader.load_dataset"
        ) as hf_load, pytest.raises(DataLoadError, match="Network error"):
            hf_load.side_effect = Exception("Network error")
            load_isles_dataset()
class TestGetDatasetInfo:
    """Unit tests for get_dataset_info."""

    def test_returns_datasetinfo(self) -> None:
        """The result is a DatasetInfo reflecting the mocked dataset size."""
        with patch("stroke_deepisles_demo.data.loader.load_dataset") as hf_load:
            fake_dataset = MagicMock()
            # MagicMock pre-configures __len__; only its return value is set.
            fake_dataset.__len__.return_value = 149
            fake_dataset.features = {"dwi": ..., "adc": ..., "mask": ...}
            hf_load.return_value = fake_dataset
            summary = get_dataset_info()
            assert isinstance(summary, DatasetInfo)
            assert summary.num_cases == 149
@pytest.mark.integration
class TestLoadIslesDatasetIntegration:
    """Integration tests that talk to the real HuggingFace Hub."""

    @pytest.mark.slow
    def test_load_real_dataset(self) -> None:
        """Loads ISLES24-MR-Lite from the HF Hub for real."""
        # Needs network access.
        # Run with: pytest -m integration
        loaded = load_isles_dataset(streaming=True)
        # Smoke check only: something came back.
        assert loaded is not None
| ``` | |
| ### what to mock | |
| - `datasets.load_dataset` - Mock for unit tests, real for integration tests | |
| - `huggingface_hub` calls - Mock for unit tests | |
| - File system operations - Use `temp_dir` fixture with real files | |
| ### what to test for real | |
| - NIfTI file creation/reading with nibabel | |
| - File staging (copy/symlink operations) | |
| - Integration test: actual HF Hub download (marked `@pytest.mark.integration`) | |
| ## "done" criteria | |
| Phase 1 is complete when: | |
| 1. All unit tests pass without network access: `uv run pytest tests/data/ -v -m "not integration"` | |
| 2. Can load synthetic test cases without network | |
| 3. Can list case IDs from mock dataset | |
| 4. Can stage files with correct DeepISLES naming | |
| 5. Integration test passes (with network): `uv run pytest -m integration` | |
| 6. Type checking passes: `uv run mypy src/stroke_deepisles_demo/data/` | |
| 7. Code coverage for data module > 80% | |
| ## implementation notes | |
| - ISLES24-MR-Lite structure needs investigation - check HF page for exact column names | |
| - Consider using `huggingface_hub.snapshot_download` if `datasets.load_dataset` has issues with NIfTI | |
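| - Staging should copy files into the staging directory (on Unix and Windows alike); symlinks are only an option when the target also lives inside the directory that Docker mounts, because a symlink pointing outside the mounted volume is dangling inside the container | |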
| - Cache the HF dataset locally to avoid repeated downloads | |
| ### critical: streaming mode + docker materialization | |
| **Reviewer feedback (valid)**: When using `streaming=True`, the dataset returns URLs or lazy file objects, NOT local POSIX paths. Docker requires physical files on the host disk for volume mounting. | |
| **Solution**: The `stage_case_for_deepisles` function MUST handle materialization: | |
| ```python | |
def stage_case_for_deepisles(
    case_files: CaseFiles,
    output_dir: Path,
    *,
    case_id: str | None = None,
) -> StagedCase:
    """
    Stage case files with DeepISLES-expected naming.

    IMPORTANT: This function handles both local paths and streaming data.
    When files come from streaming mode, they must be downloaded/materialized
    before Docker can mount them.

    Args:
        case_files: Source references; each value may be a Path, a URL, raw
            bytes, or a NIfTI object.
        output_dir: Directory to stage files into (created if necessary).
        case_id: Optional case ID, used only to label error messages.

    Returns:
        StagedCase pointing at the staged dwi.nii.gz / adc.nii.gz and, when a
        FLAIR source is present, flair.nii.gz.

    Raises:
        MissingInputError: If DWI or ADC is absent, or is a local path that
            does not exist. The message names the missing modality.
        OSError: If file operations fail.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    label_prefix = f"[{case_id}] " if case_id else ""

    # DWI and ADC are mandatory. Check them up front so that the error names
    # the modality instead of surfacing as a generic materialization failure.
    staged_required: dict[str, Path] = {}
    for key, label in (("dwi", "DWI"), ("adc", "ADC")):
        source = case_files.get(key)
        if source is None or (isinstance(source, Path) and not source.exists()):
            raise MissingInputError(f"{label_prefix}{label} input is missing: {source!r}")
        # Source may be Path, URL, bytes, or NIfTI object
        destination = output_dir / f"{key}.nii.gz"
        _materialize_nifti(source, destination)
        staged_required[key] = destination

    # FLAIR is optional: stage it only when a source is provided.
    flair_staged: Path | None = None
    flair_source = case_files.get("flair")
    if flair_source is not None:
        flair_staged = output_dir / "flair.nii.gz"
        _materialize_nifti(flair_source, flair_staged)

    return StagedCase(
        input_dir=output_dir,
        dwi_path=staged_required["dwi"],
        adc_path=staged_required["adc"],
        flair_path=flair_staged,
    )
def _materialize_nifti(source: Path | str | bytes | NiftiImage, dest: Path) -> None:
    """
    Materialize a NIfTI file to a local path.

    Handles:
    - Local Path (or a plain string path): copy
    - URL string: download
    - bytes: write directly
    - NIfTI object: serialize via its ``to_bytes()`` method

    Files are always copied, never symlinked, so the staged file is a physical
    file inside the directory Docker mounts.

    Args:
        source: Where the NIfTI data comes from.
        dest: Local file to create.

    Raises:
        MissingInputError: If a local source path does not exist, or the
            source type is not supported.
        OSError: If file operations fail.
    """
    import gzip
    import shutil

    is_url = isinstance(source, str) and source.startswith(("http://", "https://"))
    if is_url:
        # URL - download
        _download_file(source, dest)
        return

    if isinstance(source, str):
        # A non-URL string is treated as a local filesystem path.
        source = Path(source)

    if isinstance(source, Path):
        if not source.exists():
            raise MissingInputError(f"Source file does not exist: {source}")
        # Copying a file onto itself raises shutil.SameFileError; the file is
        # already where it needs to be, so there is nothing to do.
        if dest.exists() and source.samefile(dest):
            return
        shutil.copy2(source, dest)
        return

    if isinstance(source, bytes):
        # Raw bytes
        payload = source
    elif hasattr(source, "to_bytes"):
        # NIfTI object (nibabel or wrapper)
        payload = source.to_bytes()
    else:
        raise MissingInputError(f"Cannot materialize source: {type(source)}")

    # In-memory serialization may be uncompressed. A *.gz destination must
    # hold gzip data or readers that trust the extension fail, so compress
    # unless the payload already starts with the gzip magic number.
    if dest.name.endswith(".gz") and not payload.startswith(b"\x1f\x8b"):
        payload = gzip.compress(payload)
    dest.write_bytes(payload)
| ``` | |
| This ensures Docker always gets physical files regardless of how data was loaded. | |
| ## dependencies to add | |
| No new dependencies needed - all specified in Phase 0: | |
| - `datasets` (Tobias fork) | |
| - `nibabel` | |
| - `numpy` | |