Spaces:

VibecoderMcSwaggins
/

stroke-deepisles-demo

Paused

App Files Files Community

VibecoderMcSwaggins commited on Dec 13, 2025

Commit

262b3cb

unverified ·

1 Parent(s): ba32591

refactor(data): use standard datasets.load_dataset() with neuroimaging-go-brrrr

Browse files

Replaces hand-rolled HuggingFace adapter with standard datasets library using neuroimaging-go-brrrr for NIfTI support. Includes CI fixes and CodeRabbit feedback.

Files changed (8) hide show

.github/workflows/ci.yml +11 -0
src/stroke_deepisles_demo/data/adapter.py +3 -249
src/stroke_deepisles_demo/data/constants.py +0 -181
src/stroke_deepisles_demo/data/loader.py +116 -5
tests/api/test_endpoints.py +25 -19
tests/core/test_config.py +2 -1
tests/data/test_hf_adapter.py +110 -254
tests/data/test_loader.py +20 -20

.github/workflows/ci.yml CHANGED Viewed

@@ -93,6 +93,17 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - name: Install uv
         uses: astral-sh/setup-uv@v4

     steps:
       - uses: actions/checkout@v4
+      - name: Free disk space
+        uses: jlumbroso/free-disk-space@main
+        with:
+          tool-cache: false
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: false  # Keep false to avoid long cleanup time
+          docker-images: false
+          swap-storage: false
       - name: Install uv
         uses: astral-sh/setup-uv@v4

src/stroke_deepisles_demo/data/adapter.py CHANGED Viewed

@@ -3,13 +3,10 @@
 from __future__ import annotations
 import re
-import shutil
-import tempfile
-from dataclasses import dataclass, field
-from pathlib import Path
 from typing import TYPE_CHECKING, Self
-from stroke_deepisles_demo.core.exceptions import DataLoadError
 from stroke_deepisles_demo.core.logging import get_logger
 if TYPE_CHECKING:
@@ -24,7 +21,7 @@ logger = get_logger(__name__)
 class LocalDataset:
     """File-based dataset for local ISLES24 data.
-    Can be used as a context manager for consistency with HuggingFaceDataset,
     though no cleanup is needed for local files.
     Example:
@@ -133,246 +130,3 @@ def build_local_dataset(data_dir: Path) -> LocalDataset:
     logger.info("Loaded %d cases from %s", len(cases), data_dir)
     return LocalDataset(data_dir=data_dir, cases=cases)
-# =============================================================================
-# HuggingFace Dataset Adapter
-# =============================================================================
-@dataclass
-class HuggingFaceDataset:
-    """Dataset adapter for HuggingFace ISLES24 dataset.
-    Wraps the HuggingFace dataset and provides the same interface as LocalDataset.
-    When get_case() is called, downloads NIfTI bytes from individual parquet files
-    and writes them to temp files.
-    This implementation bypasses `load_dataset()` entirely to avoid:
-    1. PyArrow streaming bug (apache/arrow#45214) that hangs on parquet iteration
-    2. Memory issues from downloading the full 99GB dataset
-    IMPORTANT: Use as a context manager to ensure temp files are cleaned up:
-        with build_huggingface_dataset(dataset_id) as ds:
-            case = ds.get_case(0)
-            # ... process case ...
-        # temp files automatically cleaned up
-    Or call cleanup() manually when done.
-    """
-    dataset_id: str
-    _case_ids: list[str] = field(default_factory=list)
-    _case_index: dict[str, int] = field(default_factory=dict)
-    _temp_dir: Path | None = field(default=None, repr=False)
-    _cached_cases: dict[str, CaseFiles] = field(default_factory=dict, repr=False)
-    def __len__(self) -> int:
-        return len(self._case_ids)
-    def __iter__(self) -> Iterator[str]:
-        return iter(self._case_ids)
-    def __enter__(self) -> Self:
-        return self
-    def __exit__(self, *args: object) -> None:
-        self.cleanup()
-    def list_case_ids(self) -> list[str]:
-        """Return sorted list of subject IDs."""
-        return sorted(self._case_ids)
-    def get_case(self, case_id: str | int) -> CaseFiles:
-        """Get files for a case by ID or index.
-        Downloads NIfTI bytes from the individual parquet file for this case
-        and writes to temp files. Returns cached paths on subsequent calls.
-        This uses HfFileSystem + pyarrow to download only the single case (~50MB)
-        instead of the full dataset (99GB), completing in ~2 seconds.
-        Raises:
-            DataLoadError: If HuggingFace data is malformed or missing required fields.
-            KeyError: If case_id is not found in the dataset.
-        """
-        # Resolve case_id to subject_id and file index
-        if isinstance(case_id, int):
-            if case_id < 0 or case_id >= len(self._case_ids):
-                raise IndexError(f"Case index {case_id} out of range [0, {len(self._case_ids)})")
-            subject_id = self._case_ids[case_id]
-            file_idx = case_id
-        else:
-            subject_id = case_id
-            if subject_id not in self._case_index:
-                raise KeyError(f"Case ID '{subject_id}' not found in dataset")
-            file_idx = self._case_index[subject_id]
-        # Return cached case if already materialized
-        if subject_id in self._cached_cases:
-            return self._cached_cases[subject_id]
-        # Create shared temp directory on first use
-        if self._temp_dir is None:
-            self._temp_dir = Path(tempfile.mkdtemp(prefix="isles24_hf_"))
-            logger.debug("Created temp directory: %s", self._temp_dir)
-        # Download case data from individual parquet file
-        logger.info("Downloading case %s from HuggingFace...", subject_id)
-        case_data = self._download_case_from_parquet(file_idx, subject_id)
-        # Create case subdirectory
-        case_dir = self._temp_dir / subject_id
-        case_dir.mkdir(exist_ok=True)
-        # Write NIfTI files to temp directory
-        dwi_path = case_dir / f"{subject_id}_ses-02_dwi.nii.gz"
-        adc_path = case_dir / f"{subject_id}_ses-02_adc.nii.gz"
-        mask_path = case_dir / f"{subject_id}_ses-02_lesion-msk.nii.gz"
-        # Write the gzipped NIfTI bytes
-        dwi_path.write_bytes(case_data["dwi_bytes"])
-        adc_path.write_bytes(case_data["adc_bytes"])
-        case_files: CaseFiles = {
-            "dwi": dwi_path,
-            "adc": adc_path,
-        }
-        # Write lesion mask if available
-        if case_data.get("mask_bytes"):
-            mask_path.write_bytes(case_data["mask_bytes"])
-            case_files["ground_truth"] = mask_path
-        # Cache for subsequent calls
-        self._cached_cases[subject_id] = case_files
-        logger.info(
-            "Case %s ready: DWI=%.1fMB, ADC=%.1fMB",
-            subject_id,
-            len(case_data["dwi_bytes"]) / 1024 / 1024,
-            len(case_data["adc_bytes"]) / 1024 / 1024,
-        )
-        return case_files
-    def _download_case_from_parquet(self, file_idx: int, subject_id: str) -> dict[str, bytes]:
-        """Download case data directly from individual parquet file.
-        Uses HfFileSystem + pyarrow to read only the columns we need from
-        a single parquet file, avoiding the need to download the full dataset.
-        Args:
-            file_idx: Index of the parquet file (0-148)
-            subject_id: Expected subject ID (for validation)
-        Returns:
-            Dict with dwi_bytes, adc_bytes, and optionally mask_bytes
-        """
-        import pyarrow.parquet as pq
-        from huggingface_hub import HfFileSystem
-        from stroke_deepisles_demo.data.constants import ISLES24_NUM_FILES
-        # Construct path to the specific parquet file
-        fpath = f"datasets/{self.dataset_id}/data/train-{file_idx:05d}-of-{ISLES24_NUM_FILES:05d}.parquet"
-        try:
-            fs = HfFileSystem()
-            with fs.open(fpath, "rb") as f:
-                pf = pq.ParquetFile(f)
-                # Read only the columns we need
-                table = pf.read(columns=["subject_id", "dwi", "adc", "lesion_mask"])
-                df = table.to_pandas()
-                if len(df) != 1:
-                    raise DataLoadError(f"Expected 1 row in parquet file, got {len(df)}: {fpath}")
-                row = df.iloc[0]
-                # Validate subject_id matches
-                actual_subject_id = row["subject_id"]
-                if actual_subject_id != subject_id:
-                    raise DataLoadError(
-                        f"Subject ID mismatch: expected {subject_id}, got {actual_subject_id} in {fpath}"
-                    )
-                # Extract bytes with defensive error handling
-                try:
-                    dwi_bytes = row["dwi"]["bytes"]
-                    adc_bytes = row["adc"]["bytes"]
-                except (KeyError, TypeError) as e:
-                    raise DataLoadError(
-                        f"Malformed HuggingFace data for {subject_id}: missing 'dwi' or 'adc' bytes. "
-                        f"The dataset schema may have changed. Error: {e}"
-                    ) from e
-                result: dict[str, bytes] = {
-                    "dwi_bytes": dwi_bytes,
-                    "adc_bytes": adc_bytes,
-                }
-                # Extract mask if available
-                mask_data = row.get("lesion_mask")
-                if mask_data is not None and isinstance(mask_data, dict) and mask_data.get("bytes"):
-                    result["mask_bytes"] = mask_data["bytes"]
-                return result
-        except Exception as e:
-            if isinstance(e, DataLoadError):
-                raise
-            raise DataLoadError(f"Failed to download case {subject_id} from {fpath}: {e}") from e
-    def cleanup(self) -> None:
-        """Remove temp directory and clear cache."""
-        if self._temp_dir is not None and self._temp_dir.exists():
-            try:
-                shutil.rmtree(self._temp_dir)
-                logger.debug("Cleaned up temp directory: %s", self._temp_dir)
-            except OSError as e:
-                logger.warning("Failed to cleanup temp directory %s: %s", self._temp_dir, e)
-        self._temp_dir = None
-        self._cached_cases.clear()
-def build_huggingface_dataset(dataset_id: str) -> HuggingFaceDataset:
-    """
-    Build ISLES24 dataset adapter for HuggingFace Hub.
-    Uses pre-computed case IDs to avoid streaming enumeration (which hangs
-    due to PyArrow bug apache/arrow#45214). Actual data is downloaded lazily
-    from individual parquet files when get_case() is called.
-    Args:
-        dataset_id: HuggingFace dataset identifier (e.g., "hugging-science/isles24-stroke")
-    Returns:
-        HuggingFaceDataset providing case access
-    """
-    from stroke_deepisles_demo.data.constants import (
-        ISLES24_CASE_IDS,
-        ISLES24_CASE_INDEX,
-        ISLES24_DATASET_ID,
-    )
-    # Validate dataset_id matches our pre-computed constants
-    if dataset_id != ISLES24_DATASET_ID:
-        logger.warning(
-            "Dataset ID '%s' does not match pre-computed constants for '%s'. "
-            "Case IDs may be incorrect.",
-            dataset_id,
-            ISLES24_DATASET_ID,
-        )
-    logger.info(
-        "Building HuggingFace dataset adapter: %s (%d cases, pre-computed)",
-        dataset_id,
-        len(ISLES24_CASE_IDS),
-    )
-    return HuggingFaceDataset(
-        dataset_id=dataset_id,
-        _case_ids=list(ISLES24_CASE_IDS),
-        _case_index=dict(ISLES24_CASE_INDEX),
-    )

 from __future__ import annotations
 import re
+from dataclasses import dataclass
+from pathlib import Path  # noqa: TC003
 from typing import TYPE_CHECKING, Self
 from stroke_deepisles_demo.core.logging import get_logger
 if TYPE_CHECKING:
 class LocalDataset:
     """File-based dataset for local ISLES24 data.
+    Can be used as a context manager for consistency with HuggingFaceDatasetWrapper,
     though no cleanup is needed for local files.
     Example:
     logger.info("Loaded %d cases from %s", len(cases), data_dir)
     return LocalDataset(data_dir=data_dir, cases=cases)

src/stroke_deepisles_demo/data/constants.py DELETED Viewed

@@ -1,181 +0,0 @@
-"""Pre-computed constants for ISLES24 dataset.
-The ISLES24 challenge dataset is static (case IDs will never change).
-Pre-computing these values avoids:
-1. PyArrow streaming bug (apache/arrow#45214) that hangs on parquet iteration
-2. Memory issues from downloading the full 99GB dataset
-See docs/specs/08-bug-hf-spaces-dataset-loop.md for full investigation.
-"""
-# Pre-computed case IDs for ISLES24 dataset
-# Extracted via HfFileSystem enumeration on 2025-12-08
-# Order matches parquet file indices (train-00000-of-00149.parquet = index 0)
-ISLES24_CASE_IDS: tuple[str, ...] = (
-    "sub-stroke0001",
-    "sub-stroke0002",
-    "sub-stroke0003",
-    "sub-stroke0004",
-    "sub-stroke0005",
-    "sub-stroke0006",
-    "sub-stroke0007",
-    "sub-stroke0008",
-    "sub-stroke0009",
-    "sub-stroke0010",
-    "sub-stroke0011",
-    "sub-stroke0012",
-    "sub-stroke0013",
-    "sub-stroke0014",
-    "sub-stroke0015",
-    "sub-stroke0016",
-    "sub-stroke0017",
-    "sub-stroke0019",
-    "sub-stroke0020",
-    "sub-stroke0021",
-    "sub-stroke0022",
-    "sub-stroke0025",
-    "sub-stroke0026",
-    "sub-stroke0027",
-    "sub-stroke0028",
-    "sub-stroke0030",
-    "sub-stroke0033",
-    "sub-stroke0036",
-    "sub-stroke0037",
-    "sub-stroke0038",
-    "sub-stroke0040",
-    "sub-stroke0043",
-    "sub-stroke0045",
-    "sub-stroke0047",
-    "sub-stroke0048",
-    "sub-stroke0049",
-    "sub-stroke0052",
-    "sub-stroke0053",
-    "sub-stroke0054",
-    "sub-stroke0055",
-    "sub-stroke0057",
-    "sub-stroke0062",
-    "sub-stroke0066",
-    "sub-stroke0068",
-    "sub-stroke0070",
-    "sub-stroke0071",
-    "sub-stroke0073",
-    "sub-stroke0074",
-    "sub-stroke0075",
-    "sub-stroke0076",
-    "sub-stroke0077",
-    "sub-stroke0078",
-    "sub-stroke0079",
-    "sub-stroke0080",
-    "sub-stroke0081",
-    "sub-stroke0082",
-    "sub-stroke0083",
-    "sub-stroke0084",
-    "sub-stroke0085",
-    "sub-stroke0086",
-    "sub-stroke0087",
-    "sub-stroke0088",
-    "sub-stroke0089",
-    "sub-stroke0090",
-    "sub-stroke0091",
-    "sub-stroke0092",
-    "sub-stroke0093",
-    "sub-stroke0094",
-    "sub-stroke0095",
-    "sub-stroke0096",
-    "sub-stroke0097",
-    "sub-stroke0098",
-    "sub-stroke0099",
-    "sub-stroke0100",
-    "sub-stroke0101",
-    "sub-stroke0102",
-    "sub-stroke0103",
-    "sub-stroke0104",
-    "sub-stroke0105",
-    "sub-stroke0106",
-    "sub-stroke0107",
-    "sub-stroke0108",
-    "sub-stroke0109",
-    "sub-stroke0110",
-    "sub-stroke0111",
-    "sub-stroke0112",
-    "sub-stroke0113",
-    "sub-stroke0114",
-    "sub-stroke0115",
-    "sub-stroke0116",
-    "sub-stroke0117",
-    "sub-stroke0118",
-    "sub-stroke0119",
-    "sub-stroke0133",
-    "sub-stroke0134",
-    "sub-stroke0135",
-    "sub-stroke0136",
-    "sub-stroke0137",
-    "sub-stroke0138",
-    "sub-stroke0139",
-    "sub-stroke0140",
-    "sub-stroke0141",
-    "sub-stroke0142",
-    "sub-stroke0143",
-    "sub-stroke0144",
-    "sub-stroke0145",
-    "sub-stroke0146",
-    "sub-stroke0147",
-    "sub-stroke0148",
-    "sub-stroke0149",
-    "sub-stroke0150",
-    "sub-stroke0151",
-    "sub-stroke0152",
-    "sub-stroke0153",
-    "sub-stroke0154",
-    "sub-stroke0155",
-    "sub-stroke0156",
-    "sub-stroke0157",
-    "sub-stroke0158",
-    "sub-stroke0159",
-    "sub-stroke0161",
-    "sub-stroke0162",
-    "sub-stroke0163",
-    "sub-stroke0164",
-    "sub-stroke0165",
-    "sub-stroke0166",
-    "sub-stroke0167",
-    "sub-stroke0168",
-    "sub-stroke0169",
-    "sub-stroke0170",
-    "sub-stroke0171",
-    "sub-stroke0172",
-    "sub-stroke0173",
-    "sub-stroke0174",
-    "sub-stroke0175",
-    "sub-stroke0176",
-    "sub-stroke0177",
-    "sub-stroke0178",
-    "sub-stroke0179",
-    "sub-stroke0180",
-    "sub-stroke0181",
-    "sub-stroke0182",
-    "sub-stroke0183",
-    "sub-stroke0184",
-    "sub-stroke0185",
-    "sub-stroke0186",
-    "sub-stroke0187",
-    "sub-stroke0188",
-    "sub-stroke0189",
-)
-# Mapping from case ID to parquet file index (0-indexed)
-# train-00000-of-00149.parquet contains sub-stroke0001
-# train-00001-of-00149.parquet contains sub-stroke0002
-# etc.
-ISLES24_CASE_INDEX: dict[str, int] = {case_id: idx for idx, case_id in enumerate(ISLES24_CASE_IDS)}
-# Total number of parquet files in the dataset
-ISLES24_NUM_FILES: int = 149
-# Sanity check: ensure constants are consistent
-assert len(ISLES24_CASE_IDS) == ISLES24_NUM_FILES, (
-    f"ISLES24_CASE_IDS has {len(ISLES24_CASE_IDS)} entries but ISLES24_NUM_FILES is {ISLES24_NUM_FILES}"
-)
-# Dataset identifier on HuggingFace Hub
-ISLES24_DATASET_ID: str = "hugging-science/isles24-stroke"

src/stroke_deepisles_demo/data/loader.py CHANGED Viewed

@@ -2,12 +2,19 @@
 from __future__ import annotations
-from dataclasses import dataclass
 from pathlib import Path
 from typing import TYPE_CHECKING, Protocol, Self
 if TYPE_CHECKING:
-    from stroke_deepisles_demo.core.types import CaseFiles
 class Dataset(Protocol):
@@ -39,6 +46,103 @@ class DatasetInfo:
     has_ground_truth: bool
 # Default HuggingFace dataset ID
 DEFAULT_HF_DATASET = "hugging-science/isles24-stroke"
@@ -93,7 +197,14 @@ def load_isles_dataset(
         return build_local_dataset(Path(source))
     # HuggingFace mode
-    from stroke_deepisles_demo.data.adapter import build_huggingface_dataset
-    dataset_id = source if source else DEFAULT_HF_DATASET
-    return build_huggingface_dataset(str(dataset_id))

 from __future__ import annotations
+import shutil
+import tempfile
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Protocol, Self
+from stroke_deepisles_demo.core.logging import get_logger
+from stroke_deepisles_demo.core.types import CaseFiles  # noqa: TC001
 if TYPE_CHECKING:
+    from datasets import Dataset as HFDataset
+logger = get_logger(__name__)
 class Dataset(Protocol):
     has_ground_truth: bool
+@dataclass
+class HuggingFaceDatasetWrapper:
+    """Wrapper for HuggingFace dataset to match the Dataset protocol.
+    Uses the standard datasets library (with neuroimaging-go-brrrr patched Nifti feature)
+    to load data. Materializes NIfTI images to temporary files on demand.
+    """
+    dataset: HFDataset
+    dataset_id: str
+    _temp_dir: Path | None = field(default=None, repr=False)
+    _case_id_to_index: dict[str, int] = field(default_factory=dict, repr=False)
+    def __post_init__(self) -> None:
+        """Build index of subject IDs for O(1) lookup."""
+        try:
+            # Efficiently build index from 'subject_id' column
+            self._case_id_to_index = {
+                sid: idx for idx, sid in enumerate(self.dataset["subject_id"])
+            }
+        except (KeyError, TypeError, ValueError) as e:
+            logger.warning(
+                "Failed to build index from subject_id column: %s. Fallback to iteration.", e
+            )
+            for idx, item in enumerate(self.dataset):
+                self._case_id_to_index[item["subject_id"]] = idx
+    def __len__(self) -> int:
+        return len(self.dataset)
+    def __enter__(self) -> Self:
+        return self
+    def __exit__(self, *args: object) -> None:
+        self.cleanup()
+    def list_case_ids(self) -> list[str]:
+        return sorted(self._case_id_to_index.keys())
+    def get_case(self, case_id: str | int) -> CaseFiles:
+        """Get files for a case by ID or index.
+        Materializes NIfTI objects to temporary files.
+        """
+        # Resolve case_id to index
+        if isinstance(case_id, int):
+            if case_id < 0 or case_id >= len(self.dataset):
+                raise IndexError(f"Case index {case_id} out of range")
+            idx = case_id
+        else:
+            if case_id not in self._case_id_to_index:
+                raise KeyError(f"Case ID {case_id} not found")
+            idx = self._case_id_to_index[case_id]
+        row = self.dataset[idx]
+        subject_id = row["subject_id"]
+        # Prepare temp dir
+        if self._temp_dir is None:
+            self._temp_dir = Path(tempfile.mkdtemp(prefix="isles24_hf_wrapper_"))
+        case_dir = self._temp_dir / subject_id
+        case_dir.mkdir(exist_ok=True)
+        dwi_path = case_dir / f"{subject_id}_dwi.nii.gz"
+        adc_path = case_dir / f"{subject_id}_adc.nii.gz"
+        # Materialize files if they don't exist
+        if not dwi_path.exists():
+            row["dwi"].to_filename(str(dwi_path))
+        if not adc_path.exists():
+            row["adc"].to_filename(str(adc_path))
+        case_files: CaseFiles = {
+            "dwi": dwi_path,
+            "adc": adc_path,
+        }
+        # Handle lesion mask (mapped to ground_truth)
+        if "lesion_mask" in row and row["lesion_mask"] is not None:
+            mask_path = case_dir / f"{subject_id}_lesion-msk.nii.gz"
+            if not mask_path.exists():
+                row["lesion_mask"].to_filename(str(mask_path))
+            case_files["ground_truth"] = mask_path
+        return case_files
+    def cleanup(self) -> None:
+        if self._temp_dir and self._temp_dir.exists():
+            try:
+                shutil.rmtree(self._temp_dir)
+            except OSError as e:
+                logger.warning("Failed to cleanup temp directory %s: %s", self._temp_dir, e)
+        self._temp_dir = None
 # Default HuggingFace dataset ID
 DEFAULT_HF_DATASET = "hugging-science/isles24-stroke"
         return build_local_dataset(Path(source))
     # HuggingFace mode
+    from datasets import load_dataset
+    dataset_id = str(source) if source else DEFAULT_HF_DATASET
+    # Load dataset, selecting only necessary columns to minimize decoding overhead
+    # We rely on neuroimaging-go-brrrr's Nifti feature for lazy loading if configured,
+    # but select_columns ensures we don't touch other modalities.
+    ds = load_dataset(dataset_id, split="train")
+    ds = ds.select_columns(["subject_id", "dwi", "adc", "lesion_mask"])
+    return HuggingFaceDatasetWrapper(ds, dataset_id)

tests/api/test_endpoints.py CHANGED Viewed

@@ -84,31 +84,37 @@ class TestPostSegment:
     def test_creates_job_and_returns_202(self, client: TestClient) -> None:
         """POST /api/segment creates a job and returns 202 Accepted."""
-        response = client.post(
-            "/api/segment",
-            json={"case_id": "sub-stroke0001", "fast_mode": True},
-        )
-        assert response.status_code == 202
-        data = response.json()
-        assert "jobId" in data
-        assert data["status"] == "pending"
-        assert "message" in data
     def test_returns_job_id_for_polling(self, client: TestClient) -> None:
         """POST /api/segment returns a job ID that can be used for polling."""
-        response = client.post(
-            "/api/segment",
-            json={"case_id": "sub-stroke0001", "fast_mode": True},
-        )
-        job_id = response.json()["jobId"]
-        assert job_id is not None
-        assert len(job_id) > 0
-        # Job should be retrievable via GET /api/jobs/{id}
-        status_response = client.get(f"/api/jobs/{job_id}")
-        assert status_response.status_code == 200
     def test_returns_422_on_missing_case_id(self, client: TestClient) -> None:
         """POST /api/segment returns 422 when case_id is missing."""

     def test_creates_job_and_returns_202(self, client: TestClient) -> None:
         """POST /api/segment creates a job and returns 202 Accepted."""
+        with patch("stroke_deepisles_demo.api.routes.list_case_ids") as mock_list:
+            mock_list.return_value = ["sub-stroke0001", "sub-stroke0002"]
+            response = client.post(
+                "/api/segment",
+                json={"case_id": "sub-stroke0001", "fast_mode": True},
+            )
+            assert response.status_code == 202
+            data = response.json()
+            assert "jobId" in data
+            assert data["status"] == "pending"
+            assert "message" in data
     def test_returns_job_id_for_polling(self, client: TestClient) -> None:
         """POST /api/segment returns a job ID that can be used for polling."""
+        with patch("stroke_deepisles_demo.api.routes.list_case_ids") as mock_list:
+            mock_list.return_value = ["sub-stroke0001", "sub-stroke0002"]
+            response = client.post(
+                "/api/segment",
+                json={"case_id": "sub-stroke0001", "fast_mode": True},
+            )
+            job_id = response.json()["jobId"]
+            assert job_id is not None
+            assert len(job_id) > 0
+            # Job should be retrievable via GET /api/jobs/{id}
+            status_response = client.get(f"/api/jobs/{job_id}")
+            assert status_response.status_code == 200
     def test_returns_422_on_missing_case_id(self, client: TestClient) -> None:
         """POST /api/segment returns 422 when case_id is missing."""

tests/core/test_config.py CHANGED Viewed

@@ -25,7 +25,8 @@ class TestSettings:
         assert settings.log_level == "INFO"
         assert settings.hf_dataset_id == "hugging-science/isles24-stroke"
         assert settings.deepisles_timeout_seconds == 1800
-        assert settings.results_dir == Path("./results")
     def test_env_override(self, monkeypatch: pytest.MonkeyPatch) -> None:
         """Environment variables override defaults."""

         assert settings.log_level == "INFO"
         assert settings.hf_dataset_id == "hugging-science/isles24-stroke"
         assert settings.deepisles_timeout_seconds == 1800
+        # Default is /tmp/stroke-results for HF Spaces compatibility (only /tmp is writable)
+        assert settings.results_dir == Path("/tmp/stroke-results")
     def test_env_override(self, monkeypatch: pytest.MonkeyPatch) -> None:
         """Environment variables override defaults."""

tests/data/test_hf_adapter.py CHANGED Viewed

@@ -1,295 +1,151 @@
-"""Unit tests for HuggingFace dataset adapter with mocked HF data access."""
 from __future__ import annotations
-from unittest.mock import MagicMock, patch
 import pytest
-from stroke_deepisles_demo.core.exceptions import DataLoadError
-from stroke_deepisles_demo.data.adapter import HuggingFaceDataset, build_huggingface_dataset
-class TestHuggingFaceDataset:
-    """Tests for HuggingFaceDataset class."""
-    def test_get_case_writes_files_to_temp_dir(self) -> None:
-        """Test that get_case writes NIfTI bytes to temp files."""
-        case_ids = ["sub-stroke0001", "sub-stroke0002", "sub-stroke0003"]
-        case_index = {cid: idx for idx, cid in enumerate(case_ids)}
-        ds = HuggingFaceDataset(
-            dataset_id="test/dataset",
-            _case_ids=case_ids,
-            _case_index=case_index,
-        )
-        # Mock the download method
-        mock_data = {
-            "dwi_bytes": b"fake_dwi_nifti_data",
-            "adc_bytes": b"fake_adc_nifti_data",
-            "mask_bytes": b"fake_mask_nifti_data",
-        }
-        try:
-            with patch.object(ds, "_download_case_from_parquet", return_value=mock_data):
-                case = ds.get_case(0)
-                assert "dwi" in case
-                assert "adc" in case
-                assert case["dwi"].exists()
-                assert case["adc"].exists()
-                assert case["dwi"].read_bytes() == b"fake_dwi_nifti_data"
-                assert case["adc"].read_bytes() == b"fake_adc_nifti_data"
-        finally:
-            ds.cleanup()
-    def test_get_case_includes_ground_truth_when_available(self) -> None:
-        """Test that ground truth is included when lesion_mask is present."""
-        case_ids = ["sub-stroke0001", "sub-stroke0002", "sub-stroke0003"]
-        case_index = {cid: idx for idx, cid in enumerate(case_ids)}
-        ds = HuggingFaceDataset(
-            dataset_id="test/dataset",
-            _case_ids=case_ids,
-            _case_index=case_index,
         )
-        try:
-            # Case with mask
-            mock_data_with_mask = {
-                "dwi_bytes": b"fake_dwi_nifti_data",
-                "adc_bytes": b"fake_adc_nifti_data",
-                "mask_bytes": b"fake_mask_nifti_data",
-            }
-            with patch.object(ds, "_download_case_from_parquet", return_value=mock_data_with_mask):
-                case = ds.get_case(0)
-                assert "ground_truth" in case
-                assert case["ground_truth"].read_bytes() == b"fake_mask_nifti_data"
-            # Case without mask
-            mock_data_no_mask = {
-                "dwi_bytes": b"fake_dwi_nifti_data",
-                "adc_bytes": b"fake_adc_nifti_data",
-            }
-            with patch.object(ds, "_download_case_from_parquet", return_value=mock_data_no_mask):
-                case_no_mask = ds.get_case(2)
-                assert "ground_truth" not in case_no_mask
-        finally:
-            ds.cleanup()
-    def test_get_case_caches_results(self) -> None:
-        """Test that get_case returns cached paths on subsequent calls."""
-        case_ids = ["sub-stroke0001", "sub-stroke0002", "sub-stroke0003"]
-        case_index = {cid: idx for idx, cid in enumerate(case_ids)}
-        ds = HuggingFaceDataset(
-            dataset_id="test/dataset",
-            _case_ids=case_ids,
-            _case_index=case_index,
-        )
-        mock_data = {
-            "dwi_bytes": b"fake_dwi_nifti_data",
-            "adc_bytes": b"fake_adc_nifti_data",
         }
-        try:
-            with patch.object(
-                ds, "_download_case_from_parquet", return_value=mock_data
-            ) as mock_download:
-                case1 = ds.get_case(0)
-                case2 = ds.get_case(0)
-                # Same object returned (cached)
-                assert case1 is case2
-                # Download was only called once
-                assert mock_download.call_count == 1
-        finally:
-            ds.cleanup()
-    def test_context_manager_cleans_up_temp_files(self) -> None:
-        """Test that using context manager cleans up temp files."""
-        case_ids = ["sub-stroke0001"]
-        case_index = {"sub-stroke0001": 0}
-        ds = HuggingFaceDataset(
-            dataset_id="test/dataset",
-            _case_ids=case_ids,
-            _case_index=case_index,
         )
-        mock_data = {
-            "dwi_bytes": b"fake_dwi_nifti_data",
-            "adc_bytes": b"fake_adc_nifti_data",
-        }
-        with patch.object(ds, "_download_case_from_parquet", return_value=mock_data), ds:
-            case = ds.get_case(0)
-            temp_dir = case["dwi"].parent.parent
-            assert temp_dir.exists()
-        # After context exit, temp dir should be gone
-        assert not temp_dir.exists()
-    def test_cleanup_clears_cache(self) -> None:
-        """Test that cleanup clears the case cache."""
-        case_ids = ["sub-stroke0001"]
-        case_index = {"sub-stroke0001": 0}
-        ds = HuggingFaceDataset(
-            dataset_id="test/dataset",
-            _case_ids=case_ids,
-            _case_index=case_index,
-        )
-        mock_data = {
-            "dwi_bytes": b"fake_dwi_nifti_data",
-            "adc_bytes": b"fake_adc_nifti_data",
         }
-        with patch.object(ds, "_download_case_from_parquet", return_value=mock_data):
-            ds.get_case(0)
-            assert len(ds._cached_cases) == 1
-        ds.cleanup()
-        assert len(ds._cached_cases) == 0
-    def test_get_case_by_string_id(self) -> None:
-        """Test that get_case works with string case IDs."""
-        case_ids = ["sub-stroke0001", "sub-stroke0002", "sub-stroke0003"]
-        case_index = {cid: idx for idx, cid in enumerate(case_ids)}
-        ds = HuggingFaceDataset(
-            dataset_id="test/dataset",
-            _case_ids=case_ids,
-            _case_index=case_index,
         )
-        mock_data = {
-            "dwi_bytes": b"fake_dwi_nifti_data",
-            "adc_bytes": b"fake_adc_nifti_data",
-        }
-        try:
-            with patch.object(
-                ds, "_download_case_from_parquet", return_value=mock_data
-            ) as mock_download:
-                case = ds.get_case("sub-stroke0002")
-                assert case["dwi"].exists()
-                # Should have been called with index 1 (second case)
-                mock_download.assert_called_once_with(1, "sub-stroke0002")
-        finally:
-            ds.cleanup()
-    def test_get_case_raises_key_error_for_invalid_id(self) -> None:
-        """Test that get_case raises KeyError for invalid case ID."""
-        case_ids = ["sub-stroke0001"]
-        case_index = {"sub-stroke0001": 0}
-        ds = HuggingFaceDataset(
-            dataset_id="test/dataset",
-            _case_ids=case_ids,
-            _case_index=case_index,
-        )
-        with pytest.raises(KeyError, match="not found in dataset"):
-            ds.get_case("sub-stroke9999")
-    def test_get_case_raises_index_error_for_out_of_range(self) -> None:
-        """Test that get_case raises IndexError for out of range index."""
-        case_ids = ["sub-stroke0001"]
-        case_index = {"sub-stroke0001": 0}
-        ds = HuggingFaceDataset(
-            dataset_id="test/dataset",
-            _case_ids=case_ids,
-            _case_index=case_index,
         )
-        with pytest.raises(IndexError, match="out of range"):
-            ds.get_case(99)
-class TestBuildHuggingFaceDataset:
-    """Tests for build_huggingface_dataset function."""
-    def test_uses_precomputed_case_ids(self) -> None:
-        """Test that build_huggingface_dataset uses pre-computed case IDs."""
-        result = build_huggingface_dataset("hugging-science/isles24-stroke")
-        assert isinstance(result, HuggingFaceDataset)
-        assert result.dataset_id == "hugging-science/isles24-stroke"
-        # Should have 149 cases from pre-computed list
-        assert len(result._case_ids) == 149
-        assert "sub-stroke0001" in result._case_ids
-        assert "sub-stroke0189" in result._case_ids
-    def test_case_index_mapping_is_correct(self) -> None:
-        """Test that case index mapping matches case IDs order."""
-        result = build_huggingface_dataset("hugging-science/isles24-stroke")
-        # First case should map to index 0
-        assert result._case_index["sub-stroke0001"] == 0
-        # Last case should map to index 148
-        assert result._case_index["sub-stroke0189"] == 148
-    def test_warns_for_different_dataset_id(self) -> None:
-        """Test that a warning is logged for non-standard dataset IDs."""
-        from stroke_deepisles_demo.data.adapter import logger
-        with patch.object(logger, "warning") as mock_warning:
-            build_huggingface_dataset("some-other/dataset")
-            mock_warning.assert_called_once()
-            assert "does not match pre-computed constants" in mock_warning.call_args[0][0]
-class TestDownloadCaseFromParquet:
-    """Tests for _download_case_from_parquet method."""
-    def test_raises_data_load_error_on_malformed_data(self) -> None:
-        """Test that _download_case_from_parquet raises DataLoadError for malformed data."""
-        import pandas as pd  # type: ignore[import-untyped]
-        case_ids = ["sub-stroke0001"]
-        case_index = {"sub-stroke0001": 0}
-        ds = HuggingFaceDataset(
-            dataset_id="test/dataset",
-            _case_ids=case_ids,
-            _case_index=case_index,
-        )
-        # Create mock with missing 'bytes' key
-        mock_df = pd.DataFrame(
-            [
-                {
-                    "subject_id": "sub-stroke0001",
-                    "dwi": {},  # Missing 'bytes'
-                    "adc": {},
-                    "lesion_mask": None,
-                }
-            ]
-        )
-        mock_table = MagicMock()
-        mock_table.to_pandas.return_value = mock_df
-        mock_pf = MagicMock()
-        mock_pf.read.return_value = mock_table
-        mock_file = MagicMock()
-        mock_file.__enter__ = MagicMock(return_value=mock_file)
-        mock_file.__exit__ = MagicMock(return_value=False)
-        mock_fs = MagicMock()
-        mock_fs.open.return_value = mock_file
-        # Patch at the source module where they're imported, not where they're used
-        with (
-            patch("huggingface_hub.HfFileSystem", return_value=mock_fs),
-            patch("pyarrow.parquet.ParquetFile", return_value=mock_pf),
-            pytest.raises(DataLoadError, match="Malformed HuggingFace data"),
-        ):
-            ds._download_case_from_parquet(0, "sub-stroke0001")

+"""Unit tests for HuggingFace dataset wrapper."""
 from __future__ import annotations
+from typing import Any
+from unittest.mock import MagicMock
 import pytest
+from stroke_deepisles_demo.data.loader import HuggingFaceDatasetWrapper
+class TestHuggingFaceDatasetWrapper:
+    """Tests for HuggingFaceDatasetWrapper class."""
+    @pytest.fixture
+    def mock_hf_dataset(self) -> MagicMock:
+        """Create a mock HuggingFace dataset."""
+        dataset = MagicMock()
+        # Mock dataset length
+        dataset.__len__.return_value = 3
+        # Mock column access for fast index building
+        # This simulates dataset["subject_id"]
+        dataset.__getitem__.side_effect = lambda key: (
+            ["sub-stroke0001", "sub-stroke0002", "sub-stroke0003"]
+            if key == "subject_id"
+            else MagicMock()
         )
+        return dataset
+    def test_init_builds_index_correctly(self, mock_hf_dataset: MagicMock) -> None:
+        """Test that initialization builds the subject ID index."""
+        wrapper = HuggingFaceDatasetWrapper(mock_hf_dataset, "test/dataset")
+        assert len(wrapper) == 3
+        assert wrapper.list_case_ids() == ["sub-stroke0001", "sub-stroke0002", "sub-stroke0003"]
+        assert wrapper._case_id_to_index["sub-stroke0001"] == 0
+        assert wrapper._case_id_to_index["sub-stroke0003"] == 2
+    def test_get_case_materializes_files(self, mock_hf_dataset: MagicMock) -> None:
+        """Test that get_case materializes NIfTI objects to files."""
+        # Setup row return for get_case
+        mock_dwi = MagicMock()
+        mock_adc = MagicMock()
+        mock_mask = MagicMock()
+        row_data = {
+            "subject_id": "sub-stroke0001",
+            "dwi": mock_dwi,
+            "adc": mock_adc,
+            "lesion_mask": mock_mask,
         }
+        # Reset side_effect to return row for integer index
+        mock_hf_dataset.__getitem__.side_effect = (
+            lambda idx: row_data if isinstance(idx, int) else ["sub-stroke0001"]
         )
+        wrapper = HuggingFaceDatasetWrapper(mock_hf_dataset, "test/dataset")
+        with wrapper:
+            case = wrapper.get_case("sub-stroke0001")
+            # Verify file paths
+            assert case["dwi"].name == "sub-stroke0001_dwi.nii.gz"
+            assert case["adc"].name == "sub-stroke0001_adc.nii.gz"
+            assert case["ground_truth"].name == "sub-stroke0001_lesion-msk.nii.gz"
+            # Verify to_filename called
+            mock_dwi.to_filename.assert_called_once()
+            mock_adc.to_filename.assert_called_once()
+            mock_mask.to_filename.assert_called_once()
+            # Verify temporary directory usage
+            assert wrapper._temp_dir is not None
+            assert case["dwi"].parent == wrapper._temp_dir / "sub-stroke0001"
+    def test_get_case_handles_missing_mask(self, mock_hf_dataset: MagicMock) -> None:
+        """Test that get_case handles cases without lesion mask."""
+        row_data = {
+            "subject_id": "sub-stroke0002",
+            "dwi": MagicMock(),
+            "adc": MagicMock(),
+            "lesion_mask": None,
         }
+        mock_hf_dataset.__getitem__.side_effect = (
+            lambda idx: row_data if isinstance(idx, int) else ["sub-stroke0002"]
         )
+        wrapper = HuggingFaceDatasetWrapper(mock_hf_dataset, "test/dataset")
+        with wrapper:
+            case = wrapper.get_case("sub-stroke0002")
+            assert "dwi" in case
+            assert "adc" in case
+            assert "ground_truth" not in case
+    def test_cleanup_removes_temp_dir(self, mock_hf_dataset: MagicMock) -> None:
+        """Test that cleanup removes the temporary directory."""
+        row_data = {
+            "subject_id": "sub-stroke0001",
+            "dwi": MagicMock(),
+            "adc": MagicMock(),
+            "lesion_mask": None,
+        }
+        mock_hf_dataset.__getitem__.side_effect = (
+            lambda idx: row_data if isinstance(idx, int) else ["sub-stroke0001"]
         )
+        wrapper = HuggingFaceDatasetWrapper(mock_hf_dataset, "test/dataset")
+        # Create temp dir by accessing a case
+        wrapper.get_case(0)
+        temp_dir = wrapper._temp_dir
+        assert temp_dir is not None
+        assert temp_dir.exists()
+        # cleanup
+        wrapper.cleanup()
+        assert not temp_dir.exists()
+        assert wrapper._temp_dir is None
+    def test_fallback_iteration(self) -> None:
+        """Test fallback to iteration if column access fails."""
+        dataset = MagicMock()
+        dataset.__len__.return_value = 2
+        # Configure iteration for fallback
+        dataset.__iter__.return_value = iter([{"subject_id": "sub-0"}, {"subject_id": "sub-1"}])
+        # Fail column access
+        def getitem(key: Any) -> Any:
+            if key == "subject_id":
+                raise ValueError("No column access")
+            if isinstance(key, int):
+                return {"subject_id": f"sub-{key}"}
+            return MagicMock()
+        dataset.__getitem__.side_effect = getitem
+        wrapper = HuggingFaceDatasetWrapper(dataset, "test/dataset")
+        assert wrapper._case_id_to_index["sub-0"] == 0
+        assert wrapper._case_id_to_index["sub-1"] == 1

tests/data/test_loader.py CHANGED Viewed

@@ -4,12 +4,12 @@ from __future__ import annotations
 import os
 from typing import TYPE_CHECKING
-from unittest.mock import patch
 import pytest
-from stroke_deepisles_demo.data.adapter import HuggingFaceDataset, LocalDataset, logger
-from stroke_deepisles_demo.data.loader import load_isles_dataset
 if TYPE_CHECKING:
     from pathlib import Path
@@ -35,31 +35,31 @@ def test_load_from_local_finds_all_cases(synthetic_isles_dir: Path) -> None:
     assert dataset.list_case_ids() == ["sub-stroke0001", "sub-stroke0002"]
-def test_load_hf_warns_on_non_standard_dataset() -> None:
-    """Test that loading a non-standard HF dataset logs a warning.
-    Note: With pre-computed case IDs, the dataset ID mismatch is only detected
-    at build time (warning logged), not at get_case() time. The actual 404 error
-    would only occur when trying to download a case that doesn't exist.
-    """
-    with patch.object(logger, "warning") as mock_warning:
-        ds = load_isles_dataset(source="fake/nonexistent-dataset", local_mode=False)
-        mock_warning.assert_called_once()
-        assert "does not match pre-computed constants" in mock_warning.call_args[0][0]
-        # Dataset is still created with pre-computed case IDs
-        assert isinstance(ds, HuggingFaceDataset)
-        assert len(ds) == 149  # Uses pre-computed list
 @pytest.mark.integration
 @SKIP_IN_CI
 def test_load_from_huggingface_returns_hf_dataset() -> None:
-    """Test that loading from HuggingFace returns a HuggingFaceDataset.
     Note: Skipped in CI due to large download size (~GB) and limited disk space.
     Run locally with: pytest -m integration tests/data/test_loader.py
     """
     with load_isles_dataset() as dataset:  # Default is HuggingFace mode
-        assert isinstance(dataset, HuggingFaceDataset)
-        assert len(dataset) == 149
-        assert dataset.list_case_ids()[0] == "sub-stroke0001"

 import os
 from typing import TYPE_CHECKING
+from unittest.mock import MagicMock, patch
 import pytest
+from stroke_deepisles_demo.data.adapter import LocalDataset
+from stroke_deepisles_demo.data.loader import HuggingFaceDatasetWrapper, load_isles_dataset
 if TYPE_CHECKING:
     from pathlib import Path
     assert dataset.list_case_ids() == ["sub-stroke0001", "sub-stroke0002"]
+def test_load_hf_calls_load_dataset() -> None:
+    """Test that loading from HF calls datasets.load_dataset."""
+    with patch("datasets.load_dataset") as mock_load:
+        mock_ds = MagicMock()
+        mock_ds.__len__.return_value = 0
+        # Mock column access for index building
+        mock_ds.__getitem__.side_effect = lambda key: [] if key == "subject_id" else MagicMock()
+        mock_load.return_value = mock_ds
+        ds = load_isles_dataset(source="my/dataset", local_mode=False)
+        assert isinstance(ds, HuggingFaceDatasetWrapper)
+        mock_load.assert_called_once()
+        assert mock_load.call_args[0][0] == "my/dataset"
 @pytest.mark.integration
 @SKIP_IN_CI
 def test_load_from_huggingface_returns_hf_dataset() -> None:
+    """Test that loading from HuggingFace returns a HuggingFaceDatasetWrapper.
     Note: Skipped in CI due to large download size (~GB) and limited disk space.
     Run locally with: pytest -m integration tests/data/test_loader.py
     """
     with load_isles_dataset() as dataset:  # Default is HuggingFace mode
+        assert isinstance(dataset, HuggingFaceDatasetWrapper)
+        # We can't guarantee length if we don't mock, but we can check type
+        # Real test might fail if network issue or auth issue