Spaces:

VibecoderMcSwaggins
/

stroke-deepisles-demo

Paused

VibecoderMcSwaggins committed 6 days ago

Commit

878d2e7

1 Parent(s): 363ba14

fix(pipeline): prevent unbounded disk usage from HuggingFace temp files

CRITICAL FIX for HuggingFace Spaces deployment.

Problem:
run_pipeline_on_case() created a HuggingFaceDataset instance that wrote
NIfTI files (~50-100MB) to temp directories on each call. These temp
files were never cleaned up because the dataset's cleanup() method
was never called. This would rapidly fill the disk quota on HF Spaces.

Solution:
- Use context manager for dataset: `with load_isles_dataset() as dataset:`
- Copy ground_truth to results_dir before dataset cleanup (needed for
Dice computation after context exit)
- Dataset temp files are now automatically cleaned up via __exit__

Note: PipelineResult.input_files paths may be invalid after the function
returns in HuggingFace mode (temp files deleted). This is acceptable
since the important outputs (prediction_mask, ground_truth copy) are
in results_dir, which is preserved.

Test updates:
- Updated mocks to support context manager protocol
- Use real temp file for ground_truth in tests that need file copy

All 125 tests pass, ruff and mypy clean.

Files changed (3) hide show

src/stroke_deepisles_demo/pipeline.py +40 -35
tests/test_pipeline.py +10 -4
tests/test_pipeline_cleanup.py +6 -2

src/stroke_deepisles_demo/pipeline.py CHANGED Viewed

@@ -81,38 +81,45 @@ def run_pipeline_on_case(
     start_time = time.time()
-    # 1. Load Dataset
-    dataset = load_isles_dataset()  # Uses default local path for now
-    # Resolve ID if integer
-    if isinstance(case_id, int):
-        all_ids = dataset.list_case_ids()
-        if case_id < 0 or case_id >= len(all_ids):
-            raise IndexError(f"Case index {case_id} out of range (0-{len(all_ids) - 1})")
-        resolved_case_id = all_ids[case_id]
-    else:
-        resolved_case_id = case_id
-    # Get case files
-    case_files = dataset.get_case(resolved_case_id)
-    # 2. Stage Files
-    # Use a temp dir for staging if output_dir not provided, or a subdir of output_dir
-    if output_dir:
-        output_dir = Path(output_dir)
-        output_dir.mkdir(parents=True, exist_ok=True)
-        staging_root = output_dir / "staging" / resolved_case_id
-        results_dir = output_dir / resolved_case_id
-    else:
-        # If no output dir, we create a temp dir that persists (unless cleanup requested)
-        # But wait, the user wants paths. If we use tempfile.TemporaryDirectory context,
-        # it disappears. We should use mkdtemp or let stage_case handle it.
-        # Let's use a temp dir for staging.
-        base_temp = Path(tempfile.mkdtemp(prefix="deepisles_pipeline_"))
-        staging_root = base_temp / "staging"
-        results_dir = base_temp / "results"
-    staged = stage_case_for_deepisles(case_files, staging_root)
     # 3. Run Inference
     inference_result = run_deepisles_on_folder(
@@ -122,10 +129,8 @@ def run_pipeline_on_case(
         gpu=gpu,
     )
-    # 4. Compute Metrics
     dice_score: float | None = None
-    ground_truth = case_files.get("ground_truth")
     if compute_dice and ground_truth and ground_truth.exists():
         try:
             dice_score = metrics.compute_dice(inference_result.prediction_path, ground_truth)

     start_time = time.time()
+    # Use context manager to ensure HuggingFace temp files are cleaned up
+    # This prevents unbounded disk usage from accumulating temp NIfTI files
+    with load_isles_dataset() as dataset:
+        # Resolve ID if integer
+        if isinstance(case_id, int):
+            all_ids = dataset.list_case_ids()
+            if case_id < 0 or case_id >= len(all_ids):
+                raise IndexError(f"Case index {case_id} out of range (0-{len(all_ids) - 1})")
+            resolved_case_id = all_ids[case_id]
+        else:
+            resolved_case_id = case_id
+        # Set up output directories (now that we have resolved_case_id)
+        if output_dir:
+            output_dir = Path(output_dir)
+            output_dir.mkdir(parents=True, exist_ok=True)
+            staging_root = output_dir / "staging" / resolved_case_id
+            results_dir = output_dir / resolved_case_id
+        else:
+            base_temp = Path(tempfile.mkdtemp(prefix="deepisles_pipeline_"))
+            staging_root = base_temp / "staging"
+            results_dir = base_temp / "results"
+        # Get case files
+        case_files = dataset.get_case(resolved_case_id)
+        # Stage files (copies DWI/ADC to staging directory)
+        staged = stage_case_for_deepisles(case_files, staging_root)
+        # Copy ground truth to results_dir before dataset cleanup
+        # (HuggingFace mode stores ground truth in temp files that get cleaned up)
+        ground_truth: Path | None = None
+        original_ground_truth = case_files.get("ground_truth")
+        if original_ground_truth and original_ground_truth.exists():
+            results_dir.mkdir(parents=True, exist_ok=True)
+            ground_truth = results_dir / f"{resolved_case_id}_ground_truth.nii.gz"
+            shutil.copy2(original_ground_truth, ground_truth)
+    # Dataset temp files cleaned up here (context manager __exit__)
     # 3. Run Inference
     inference_result = run_deepisles_on_folder(
         gpu=gpu,
     )
+    # 4. Compute Metrics (using copied ground truth)
     dice_score: float | None = None
     if compute_dice and ground_truth and ground_truth.exists():
         try:
             dice_score = metrics.compute_dice(inference_result.prediction_path, ground_truth)

tests/test_pipeline.py CHANGED Viewed

@@ -35,21 +35,25 @@ class TestRunPipelineOnCase:
             # Configure mocks
             mock_dataset = MagicMock()
             # Mock paths that "exist"
             dwi_path = MagicMock(spec=Path)
             dwi_path.exists.return_value = True
             adc_path = MagicMock(spec=Path)
             adc_path.exists.return_value = True
-            gt_path = MagicMock(spec=Path)
-            gt_path.exists.return_value = True
             mock_dataset.get_case.return_value = CaseFiles(
                 dwi=dwi_path,
                 adc=adc_path,
-                ground_truth=gt_path,
                 # flair omitted
             )
-            mock_load.return_value = mock_dataset
             mock_stage.return_value = MagicMock(
                 input_dir=temp_dir / "staged",
@@ -147,7 +151,9 @@ class TestRunPipelineOnCase:
         """Handles cases without ground truth gracefully."""
         # Modify mock to return no ground truth
         dwi = MagicMock(spec=Path)
         adc = MagicMock(spec=Path)
         mock_dependencies["dataset"].get_case.return_value = CaseFiles(
             dwi=dwi,
             adc=adc,

             # Configure mocks
             mock_dataset = MagicMock()
+            # Create real temp files for ground truth (context manager cleans up HF temp files)
+            gt_file = temp_dir / "gt_mock.nii.gz"
+            gt_file.write_bytes(b"fake nifti")
             # Mock paths that "exist"
             dwi_path = MagicMock(spec=Path)
             dwi_path.exists.return_value = True
             adc_path = MagicMock(spec=Path)
             adc_path.exists.return_value = True
             mock_dataset.get_case.return_value = CaseFiles(
                 dwi=dwi_path,
                 adc=adc_path,
+                ground_truth=gt_file,  # Use real file for copy operation
                 # flair omitted
             )
+            # Support context manager protocol: with load_isles_dataset() as dataset:
+            mock_load.return_value.__enter__ = MagicMock(return_value=mock_dataset)
+            mock_load.return_value.__exit__ = MagicMock(return_value=None)
             mock_stage.return_value = MagicMock(
                 input_dir=temp_dir / "staged",
         """Handles cases without ground truth gracefully."""
         # Modify mock to return no ground truth
         dwi = MagicMock(spec=Path)
+        dwi.exists.return_value = True
         adc = MagicMock(spec=Path)
+        adc.exists.return_value = True
         mock_dependencies["dataset"].get_case.return_value = CaseFiles(
             dwi=dwi,
             adc=adc,

tests/test_pipeline_cleanup.py CHANGED Viewed

@@ -17,9 +17,13 @@ def test_pipeline_cleanup_default() -> None:
     ):
         # Setup mocks
         mock_dataset = MagicMock()
-        mock_load.return_value = mock_dataset
         mock_dataset.list_case_ids.return_value = ["case1"]
-        mock_dataset.get_case.return_value = {"dwi": Path("dwi.nii.gz")}
         mock_staged = MagicMock()
         mock_staged.input_dir = Path("/tmp/mock_staging")

     ):
         # Setup mocks
         mock_dataset = MagicMock()
         mock_dataset.list_case_ids.return_value = ["case1"]
+        # Return dict without ground_truth to avoid file copy attempt
+        mock_dataset.get_case.return_value = {"dwi": Path("dwi.nii.gz"), "adc": Path("adc.nii.gz")}
+        # Support context manager protocol: with load_isles_dataset() as dataset:
+        mock_load.return_value.__enter__ = MagicMock(return_value=mock_dataset)
+        mock_load.return_value.__exit__ = MagicMock(return_value=None)
         mock_staged = MagicMock()
         mock_staged.input_dir = Path("/tmp/mock_staging")