VibecoderMcSwaggins's picture
fix(data): bypass load_dataset() to fix HF Spaces streaming hang and OOM (#16)
80cbb1a unverified
raw
history blame
4.52 kB
"""Pre-computed constants for ISLES24 dataset.
The ISLES24 challenge dataset is static (case IDs will never change).
Pre-computing these values avoids:
1. The PyArrow streaming bug (apache/arrow#45214) that hangs on parquet iteration
2. Memory issues caused by downloading the full 99GB dataset
See docs/specs/08-bug-hf-spaces-dataset-loop.md for full investigation.
"""
# Pre-computed case IDs for the ISLES24 dataset.
# Extracted via HfFileSystem enumeration on 2025-12-08.
# Order matches parquet file indices (train-00000-of-00149.parquet = index 0),
# so a case's position in this tuple is its parquet file index.
# NOTE: the numeric suffixes are NOT contiguous (e.g. there is no
# "sub-stroke0018", and the IDs jump from 0119 to 0133), so the suffix must
# never be used to derive the index -- "sub-stroke0019" sits at index 17.
ISLES24_CASE_IDS: tuple[str, ...] = (
"sub-stroke0001",
"sub-stroke0002",
"sub-stroke0003",
"sub-stroke0004",
"sub-stroke0005",
"sub-stroke0006",
"sub-stroke0007",
"sub-stroke0008",
"sub-stroke0009",
"sub-stroke0010",
"sub-stroke0011",
"sub-stroke0012",
"sub-stroke0013",
"sub-stroke0014",
"sub-stroke0015",
"sub-stroke0016",
"sub-stroke0017",
"sub-stroke0019",
"sub-stroke0020",
"sub-stroke0021",
"sub-stroke0022",
"sub-stroke0025",
"sub-stroke0026",
"sub-stroke0027",
"sub-stroke0028",
"sub-stroke0030",
"sub-stroke0033",
"sub-stroke0036",
"sub-stroke0037",
"sub-stroke0038",
"sub-stroke0040",
"sub-stroke0043",
"sub-stroke0045",
"sub-stroke0047",
"sub-stroke0048",
"sub-stroke0049",
"sub-stroke0052",
"sub-stroke0053",
"sub-stroke0054",
"sub-stroke0055",
"sub-stroke0057",
"sub-stroke0062",
"sub-stroke0066",
"sub-stroke0068",
"sub-stroke0070",
"sub-stroke0071",
"sub-stroke0073",
"sub-stroke0074",
"sub-stroke0075",
"sub-stroke0076",
"sub-stroke0077",
"sub-stroke0078",
"sub-stroke0079",
"sub-stroke0080",
"sub-stroke0081",
"sub-stroke0082",
"sub-stroke0083",
"sub-stroke0084",
"sub-stroke0085",
"sub-stroke0086",
"sub-stroke0087",
"sub-stroke0088",
"sub-stroke0089",
"sub-stroke0090",
"sub-stroke0091",
"sub-stroke0092",
"sub-stroke0093",
"sub-stroke0094",
"sub-stroke0095",
"sub-stroke0096",
"sub-stroke0097",
"sub-stroke0098",
"sub-stroke0099",
"sub-stroke0100",
"sub-stroke0101",
"sub-stroke0102",
"sub-stroke0103",
"sub-stroke0104",
"sub-stroke0105",
"sub-stroke0106",
"sub-stroke0107",
"sub-stroke0108",
"sub-stroke0109",
"sub-stroke0110",
"sub-stroke0111",
"sub-stroke0112",
"sub-stroke0113",
"sub-stroke0114",
"sub-stroke0115",
"sub-stroke0116",
"sub-stroke0117",
"sub-stroke0118",
"sub-stroke0119",
"sub-stroke0133",
"sub-stroke0134",
"sub-stroke0135",
"sub-stroke0136",
"sub-stroke0137",
"sub-stroke0138",
"sub-stroke0139",
"sub-stroke0140",
"sub-stroke0141",
"sub-stroke0142",
"sub-stroke0143",
"sub-stroke0144",
"sub-stroke0145",
"sub-stroke0146",
"sub-stroke0147",
"sub-stroke0148",
"sub-stroke0149",
"sub-stroke0150",
"sub-stroke0151",
"sub-stroke0152",
"sub-stroke0153",
"sub-stroke0154",
"sub-stroke0155",
"sub-stroke0156",
"sub-stroke0157",
"sub-stroke0158",
"sub-stroke0159",
"sub-stroke0161",
"sub-stroke0162",
"sub-stroke0163",
"sub-stroke0164",
"sub-stroke0165",
"sub-stroke0166",
"sub-stroke0167",
"sub-stroke0168",
"sub-stroke0169",
"sub-stroke0170",
"sub-stroke0171",
"sub-stroke0172",
"sub-stroke0173",
"sub-stroke0174",
"sub-stroke0175",
"sub-stroke0176",
"sub-stroke0177",
"sub-stroke0178",
"sub-stroke0179",
"sub-stroke0180",
"sub-stroke0181",
"sub-stroke0182",
"sub-stroke0183",
"sub-stroke0184",
"sub-stroke0185",
"sub-stroke0186",
"sub-stroke0187",
"sub-stroke0188",
"sub-stroke0189",
)
# Reverse lookup: case ID -> parquet file index (0-indexed), i.e. the ID's
# position in ISLES24_CASE_IDS:
#   train-00000-of-00149.parquet contains sub-stroke0001
#   train-00001-of-00149.parquet contains sub-stroke0002
#   ...
# The index follows tuple position, not the numeric suffix: the IDs are not
# contiguous, so e.g. sub-stroke0019 maps to 17, not 18.
ISLES24_CASE_INDEX: dict[str, int] = dict(
    zip(ISLES24_CASE_IDS, range(len(ISLES24_CASE_IDS)))
)
# Total number of parquet files in the dataset
ISLES24_NUM_FILES: int = 149
# Sanity checks: ensure the constants are mutually consistent.
# These raise explicitly instead of using `assert`, because assert statements
# are stripped under `python -O` and a mismatched table would then go
# unnoticed until a wrong parquet file is fetched.
if len(ISLES24_CASE_IDS) != ISLES24_NUM_FILES:
    raise AssertionError(
        f"ISLES24_CASE_IDS has {len(ISLES24_CASE_IDS)} entries but ISLES24_NUM_FILES is {ISLES24_NUM_FILES}"
    )
# A repeated ID would still satisfy the length check above, yet any dict keyed
# by case ID can only keep one of the two positions -- reject it up front.
if len(set(ISLES24_CASE_IDS)) != len(ISLES24_CASE_IDS):
    raise AssertionError("ISLES24_CASE_IDS contains duplicate case IDs")
# Dataset repository identifier on the HuggingFace Hub ("<namespace>/<name>")
ISLES24_DATASET_ID: str = "hugging-science/isles24-stroke"