VibecoderMcSwaggins committed on
Commit
878d2e7
·
1 Parent(s): 363ba14

fix(pipeline): prevent unbounded disk usage from HuggingFace temp files

Browse files

CRITICAL FIX for HuggingFace Spaces deployment.

Problem:
run_pipeline_on_case() created a HuggingFaceDataset instance that wrote
NIfTI files (~50-100MB) to temp directories on each call. These temp
files were never cleaned up because the dataset's cleanup() method
was never called. This would rapidly fill disk quota on HF Spaces.

Solution:
- Use context manager for dataset: `with load_isles_dataset() as dataset:`
- Copy ground_truth to results_dir before dataset cleanup (needed for
Dice computation after context exit)
- Dataset temp files are now automatically cleaned up via __exit__

Note: In HuggingFace mode, PipelineResult.input_files paths may point to
deleted temp files once the function returns. This is acceptable
because the important outputs (prediction_mask, ground_truth copy) are
written to results_dir, which is preserved.

Test updates:
- Updated mocks to support context manager protocol
- Use real temp file for ground_truth in tests that need file copy

All 125 tests pass, ruff and mypy clean.

src/stroke_deepisles_demo/pipeline.py CHANGED
@@ -81,38 +81,45 @@ def run_pipeline_on_case(
81
 
82
  start_time = time.time()
83
 
84
- # 1. Load Dataset
85
- dataset = load_isles_dataset() # Uses default local path for now
86
-
87
- # Resolve ID if integer
88
- if isinstance(case_id, int):
89
- all_ids = dataset.list_case_ids()
90
- if case_id < 0 or case_id >= len(all_ids):
91
- raise IndexError(f"Case index {case_id} out of range (0-{len(all_ids) - 1})")
92
- resolved_case_id = all_ids[case_id]
93
- else:
94
- resolved_case_id = case_id
95
-
96
- # Get case files
97
- case_files = dataset.get_case(resolved_case_id)
98
-
99
- # 2. Stage Files
100
- # Use a temp dir for staging if output_dir not provided, or a subdir of output_dir
101
- if output_dir:
102
- output_dir = Path(output_dir)
103
- output_dir.mkdir(parents=True, exist_ok=True)
104
- staging_root = output_dir / "staging" / resolved_case_id
105
- results_dir = output_dir / resolved_case_id
106
- else:
107
- # If no output dir, we create a temp dir that persists (unless cleanup requested)
108
- # But wait, the user wants paths. If we use tempfile.TemporaryDirectory context,
109
- # it disappears. We should use mkdtemp or let stage_case handle it.
110
- # Let's use a temp dir for staging.
111
- base_temp = Path(tempfile.mkdtemp(prefix="deepisles_pipeline_"))
112
- staging_root = base_temp / "staging"
113
- results_dir = base_temp / "results"
114
-
115
- staged = stage_case_for_deepisles(case_files, staging_root)
 
 
 
 
 
 
 
116
 
117
  # 3. Run Inference
118
  inference_result = run_deepisles_on_folder(
@@ -122,10 +129,8 @@ def run_pipeline_on_case(
122
  gpu=gpu,
123
  )
124
 
125
- # 4. Compute Metrics
126
  dice_score: float | None = None
127
- ground_truth = case_files.get("ground_truth")
128
-
129
  if compute_dice and ground_truth and ground_truth.exists():
130
  try:
131
  dice_score = metrics.compute_dice(inference_result.prediction_path, ground_truth)
 
81
 
82
  start_time = time.time()
83
 
84
+ # Use context manager to ensure HuggingFace temp files are cleaned up
85
+ # This prevents unbounded disk usage from accumulating temp NIfTI files
86
+ with load_isles_dataset() as dataset:
87
+ # Resolve ID if integer
88
+ if isinstance(case_id, int):
89
+ all_ids = dataset.list_case_ids()
90
+ if case_id < 0 or case_id >= len(all_ids):
91
+ raise IndexError(f"Case index {case_id} out of range (0-{len(all_ids) - 1})")
92
+ resolved_case_id = all_ids[case_id]
93
+ else:
94
+ resolved_case_id = case_id
95
+
96
+ # Set up output directories (now that we have resolved_case_id)
97
+ if output_dir:
98
+ output_dir = Path(output_dir)
99
+ output_dir.mkdir(parents=True, exist_ok=True)
100
+ staging_root = output_dir / "staging" / resolved_case_id
101
+ results_dir = output_dir / resolved_case_id
102
+ else:
103
+ base_temp = Path(tempfile.mkdtemp(prefix="deepisles_pipeline_"))
104
+ staging_root = base_temp / "staging"
105
+ results_dir = base_temp / "results"
106
+
107
+ # Get case files
108
+ case_files = dataset.get_case(resolved_case_id)
109
+
110
+ # Stage files (copies DWI/ADC to staging directory)
111
+ staged = stage_case_for_deepisles(case_files, staging_root)
112
+
113
+ # Copy ground truth to results_dir before dataset cleanup
114
+ # (HuggingFace mode stores ground truth in temp files that get cleaned up)
115
+ ground_truth: Path | None = None
116
+ original_ground_truth = case_files.get("ground_truth")
117
+ if original_ground_truth and original_ground_truth.exists():
118
+ results_dir.mkdir(parents=True, exist_ok=True)
119
+ ground_truth = results_dir / f"{resolved_case_id}_ground_truth.nii.gz"
120
+ shutil.copy2(original_ground_truth, ground_truth)
121
+
122
+ # Dataset temp files cleaned up here (context manager __exit__)
123
 
124
  # 3. Run Inference
125
  inference_result = run_deepisles_on_folder(
 
129
  gpu=gpu,
130
  )
131
 
132
+ # 4. Compute Metrics (using copied ground truth)
133
  dice_score: float | None = None
 
 
134
  if compute_dice and ground_truth and ground_truth.exists():
135
  try:
136
  dice_score = metrics.compute_dice(inference_result.prediction_path, ground_truth)
tests/test_pipeline.py CHANGED
@@ -35,21 +35,25 @@ class TestRunPipelineOnCase:
35
  # Configure mocks
36
  mock_dataset = MagicMock()
37
 
 
 
 
 
38
  # Mock paths that "exist"
39
  dwi_path = MagicMock(spec=Path)
40
  dwi_path.exists.return_value = True
41
  adc_path = MagicMock(spec=Path)
42
  adc_path.exists.return_value = True
43
- gt_path = MagicMock(spec=Path)
44
- gt_path.exists.return_value = True
45
 
46
  mock_dataset.get_case.return_value = CaseFiles(
47
  dwi=dwi_path,
48
  adc=adc_path,
49
- ground_truth=gt_path,
50
  # flair omitted
51
  )
52
- mock_load.return_value = mock_dataset
 
 
53
 
54
  mock_stage.return_value = MagicMock(
55
  input_dir=temp_dir / "staged",
@@ -147,7 +151,9 @@ class TestRunPipelineOnCase:
147
  """Handles cases without ground truth gracefully."""
148
  # Modify mock to return no ground truth
149
  dwi = MagicMock(spec=Path)
 
150
  adc = MagicMock(spec=Path)
 
151
  mock_dependencies["dataset"].get_case.return_value = CaseFiles(
152
  dwi=dwi,
153
  adc=adc,
 
35
  # Configure mocks
36
  mock_dataset = MagicMock()
37
 
38
+ # Create real temp files for ground truth (context manager cleans up HF temp files)
39
+ gt_file = temp_dir / "gt_mock.nii.gz"
40
+ gt_file.write_bytes(b"fake nifti")
41
+
42
  # Mock paths that "exist"
43
  dwi_path = MagicMock(spec=Path)
44
  dwi_path.exists.return_value = True
45
  adc_path = MagicMock(spec=Path)
46
  adc_path.exists.return_value = True
 
 
47
 
48
  mock_dataset.get_case.return_value = CaseFiles(
49
  dwi=dwi_path,
50
  adc=adc_path,
51
+ ground_truth=gt_file, # Use real file for copy operation
52
  # flair omitted
53
  )
54
+ # Support context manager protocol: with load_isles_dataset() as dataset:
55
+ mock_load.return_value.__enter__ = MagicMock(return_value=mock_dataset)
56
+ mock_load.return_value.__exit__ = MagicMock(return_value=None)
57
 
58
  mock_stage.return_value = MagicMock(
59
  input_dir=temp_dir / "staged",
 
151
  """Handles cases without ground truth gracefully."""
152
  # Modify mock to return no ground truth
153
  dwi = MagicMock(spec=Path)
154
+ dwi.exists.return_value = True
155
  adc = MagicMock(spec=Path)
156
+ adc.exists.return_value = True
157
  mock_dependencies["dataset"].get_case.return_value = CaseFiles(
158
  dwi=dwi,
159
  adc=adc,
tests/test_pipeline_cleanup.py CHANGED
@@ -17,9 +17,13 @@ def test_pipeline_cleanup_default() -> None:
17
  ):
18
  # Setup mocks
19
  mock_dataset = MagicMock()
20
- mock_load.return_value = mock_dataset
21
  mock_dataset.list_case_ids.return_value = ["case1"]
22
- mock_dataset.get_case.return_value = {"dwi": Path("dwi.nii.gz")}
 
 
 
 
 
23
 
24
  mock_staged = MagicMock()
25
  mock_staged.input_dir = Path("/tmp/mock_staging")
 
17
  ):
18
  # Setup mocks
19
  mock_dataset = MagicMock()
 
20
  mock_dataset.list_case_ids.return_value = ["case1"]
21
+ # Return dict without ground_truth to avoid file copy attempt
22
+ mock_dataset.get_case.return_value = {"dwi": Path("dwi.nii.gz"), "adc": Path("adc.nii.gz")}
23
+
24
+ # Support context manager protocol: with load_isles_dataset() as dataset:
25
+ mock_load.return_value.__enter__ = MagicMock(return_value=mock_dataset)
26
+ mock_load.return_value.__exit__ = MagicMock(return_value=None)
27
 
28
  mock_staged = MagicMock()
29
  mock_staged.input_dir = Path("/tmp/mock_staging")