VibecoderMcSwaggins's picture
feat(data): integrate HuggingFace dataset as primary data source (#11)
363ba14 unverified
raw
history blame
3.07 kB
"""Load ISLES24 data from local directory or HuggingFace Hub."""
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Protocol, Self
if TYPE_CHECKING:
from stroke_deepisles_demo.core.types import CaseFiles
class Dataset(Protocol):
"""Protocol for dataset access.
All dataset implementations support context manager usage for proper cleanup:
with load_isles_dataset() as ds:
case = ds.get_case(0)
# ... process case ...
# cleanup happens automatically
"""
def __len__(self) -> int: ...
def __enter__(self) -> Self: ...
def __exit__(self, *args: object) -> None: ...
def list_case_ids(self) -> list[str]: ...
def get_case(self, case_id: str | int) -> CaseFiles: ...
def cleanup(self) -> None: ...
@dataclass
class DatasetInfo:
    """Summary metadata describing a loaded dataset.

    Attributes:
        source: ``"local"`` or the HuggingFace dataset ID.
        num_cases: Number of cases in the dataset.
        modalities: Names of the modalities the dataset provides.
        has_ground_truth: Whether ground truth is available.
    """

    source: str
    num_cases: int
    modalities: list[str]
    has_ground_truth: bool
# Default HuggingFace dataset ID.
# load_isles_dataset() falls back to this in HuggingFace mode when the caller
# passes no (or an empty) source.
DEFAULT_HF_DATASET = "hugging-science/isles24-stroke"
def load_isles_dataset(
    source: str | Path | None = None,
    *,
    local_mode: bool | None = None,
) -> Dataset:
    """
    Return an ISLES24 dataset backed by a local directory or the HuggingFace Hub.

    Args:
        source: Local directory path or HuggingFace dataset ID.
            If None, the default HuggingFace dataset is used.
        local_mode: If True, treat source as a local directory; if False,
            treat it as a HuggingFace dataset ID. If None, the mode is
            auto-detected: no source means HuggingFace, a ``Path`` means
            local, and a string means local only when that path exists on
            disk (otherwise it is taken to be a dataset ID).

    Returns:
        Dataset-like object providing case access. Use it as a context
        manager for automatic cleanup of temp files (important for
        HuggingFace mode).

    Examples:
        # Load from HuggingFace with automatic cleanup (recommended)
        with load_isles_dataset() as ds:
            case = ds.get_case(0)

        # Load from local directory
        ds = load_isles_dataset("data/isles24", local_mode=True)

        # Load specific HuggingFace dataset
        ds = load_isles_dataset("hugging-science/isles24-stroke")
    """
    if local_mode is None:
        # Auto-detect. A string only counts as local when the path itself
        # exists, so Hub IDs such as "org/dataset" are not misclassified.
        use_local = source is not None and (
            isinstance(source, Path) or Path(source).exists()
        )
    else:
        use_local = local_mode

    if not use_local:
        # HuggingFace mode: an unset/empty source means the default dataset.
        from stroke_deepisles_demo.data.adapter import build_huggingface_dataset

        hub_id = source or DEFAULT_HF_DATASET
        return build_huggingface_dataset(str(hub_id))

    # Local mode: fall back to the conventional data directory when the
    # caller forced local_mode=True without naming one.
    from stroke_deepisles_demo.data.adapter import build_local_dataset

    local_root = "data/isles24" if source is None else source
    return build_local_dataset(Path(local_root))