raylim Claude Sonnet 4.5 committed on
Commit
42a4892
·
unverified ·
1 Parent(s): 4fda083

Centralize hardware detection and optimize T4 GPU memory management

Browse files

- Create new mosaic.hardware module for centralized GPU detection
- Set T4 concurrency limit to 1 to prevent concurrent OOM errors
- Update analysis.py, model_manager.py, and ui/app.py to use centralized detection
- Simplify cleanup logic: only Paladin models cleared on T4, core models persist
- Remove redundant GPU cleanup code (already handled by model_cache.cleanup())

Fixes T4 memory issues by enforcing sequential processing while maintaining
efficient batch processing on high-memory GPUs.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

src/mosaic/analysis.py CHANGED
@@ -4,57 +4,9 @@ This module provides the main slide analysis pipeline that integrates tissue seg
4
  feature extraction, and model inference for cancer subtype and biomarker prediction.
5
  """
6
 
7
- # Import spaces first before any CUDA-related imports
8
- import os
9
-
10
- try:
11
- import spaces
12
-
13
- HAS_SPACES = True
14
- # Check if we're actually running on ZeroGPU
15
- # HF Spaces sets SPACES_ZERO_GPU=1 when using ZeroGPU
16
- IS_ZEROGPU = os.environ.get("SPACES_ZERO_GPU") == "1"
17
- except ImportError:
18
- HAS_SPACES = False
19
- IS_ZEROGPU = False
20
-
21
- # Create a no-op decorator if spaces is not available
22
- class spaces:
23
- @staticmethod
24
- def GPU(fn=None, duration=None):
25
- if fn is None:
26
- return lambda f: f
27
- return fn
28
-
29
-
30
- # Detect T4 hardware by checking actual GPU
31
- import torch
32
-
33
- IS_T4_GPU = False
34
- GPU_NAME = "Unknown"
35
- if not IS_ZEROGPU and torch.cuda.is_available():
36
- try:
37
- GPU_NAME = torch.cuda.get_device_name(0)
38
- IS_T4_GPU = "T4" in GPU_NAME
39
- except:
40
- pass
41
-
42
- # Set optimal parameters based on hardware
43
- if IS_ZEROGPU:
44
- DEFAULT_BATCH_SIZE = 128
45
- DEFAULT_NUM_WORKERS = 0
46
- GPU_TYPE = "ZeroGPU (H100)"
47
- elif IS_T4_GPU:
48
- DEFAULT_BATCH_SIZE = 64
49
- DEFAULT_NUM_WORKERS = 4
50
- GPU_TYPE = f"T4 ({GPU_NAME})"
51
- else:
52
- DEFAULT_BATCH_SIZE = 64
53
- DEFAULT_NUM_WORKERS = 8
54
- GPU_TYPE = f"Standard GPU ({GPU_NAME})"
55
-
56
  import pickle
57
  import gc
 
58
  import pandas as pd
59
  import gradio as gr
60
  from pathlib import Path
@@ -66,9 +18,14 @@ from loguru import logger
66
  from mosaic.inference import run_aeon, run_paladin
67
  from mosaic.data_directory import get_data_directory
68
 
69
- # Log hardware detection at module load
70
- logger.info(
71
- f"Hardware: {GPU_TYPE} | batch_size={DEFAULT_BATCH_SIZE}, num_workers={DEFAULT_NUM_WORKERS}"
 
 
 
 
 
72
  )
73
 
74
 
@@ -441,14 +398,11 @@ def _run_inference_pipeline_impl(
441
  # Clean up models to free GPU memory
442
  logger.info("Cleaning up models after single-slide inference")
443
  model_cache.cleanup()
444
-
445
- # Extra aggressive cleanup for T4 instances
446
- if torch.cuda.is_available():
447
  torch.cuda.synchronize()
448
- torch.cuda.empty_cache()
449
- gc.collect()
450
- mem_allocated = torch.cuda.memory_allocated() / (1024**3)
451
- logger.info(f"GPU memory after cleanup: {mem_allocated:.2f} GB")
452
 
453
 
454
  # ============================================================================
 
4
  feature extraction, and model inference for cancer subtype and biomarker prediction.
5
  """
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  import pickle
8
  import gc
9
+ import torch
10
  import pandas as pd
11
  import gradio as gr
12
  from pathlib import Path
 
18
  from mosaic.inference import run_aeon, run_paladin
19
  from mosaic.data_directory import get_data_directory
20
 
21
+ # Import centralized hardware detection
22
+ from mosaic.hardware import (
23
+ spaces,
24
+ IS_ZEROGPU,
25
+ IS_T4_GPU,
26
+ GPU_TYPE,
27
+ DEFAULT_BATCH_SIZE,
28
+ DEFAULT_NUM_WORKERS,
29
  )
30
 
31
 
 
398
  # Clean up models to free GPU memory
399
  logger.info("Cleaning up models after single-slide inference")
400
  model_cache.cleanup()
401
+
402
+ # T4-specific: Ensure GPU operations are complete before next request
403
+ if IS_T4_GPU and torch.cuda.is_available():
404
  torch.cuda.synchronize()
405
+ logger.info("T4: GPU operations synchronized")
 
 
 
406
 
407
 
408
  # ============================================================================
src/mosaic/hardware.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Hardware detection and configuration for GPU-specific optimizations.

This module centralizes all hardware detection logic to provide consistent
GPU-specific settings across the Mosaic application.

Exports (see ``__all__``): the ``spaces`` decorator provider, environment
flags (``HAS_SPACES``, ``IS_ZEROGPU``, ``IS_T4_GPU``), the detected
``GPU_NAME``/``GPU_TYPE`` strings, and hardware-tuned defaults
(``DEFAULT_BATCH_SIZE``, ``DEFAULT_NUM_WORKERS``, ``DEFAULT_CONCURRENCY_LIMIT``).
"""

import os

import torch
from loguru import logger

# Detect the HuggingFace Spaces ZeroGPU environment.
try:
    import spaces

    HAS_SPACES = True
    # HF Spaces sets SPACES_ZERO_GPU=1 when the Space runs on ZeroGPU.
    IS_ZEROGPU = os.environ.get("SPACES_ZERO_GPU") == "1"
except ImportError:
    HAS_SPACES = False
    IS_ZEROGPU = False

    # Fallback no-op decorator so ``@spaces.GPU`` keeps working outside
    # HF Spaces.  This class is deliberately defined ONLY inside the
    # except branch: defining it unconditionally would shadow the real
    # ``spaces`` module and silently disable ZeroGPU allocation.
    class spaces:
        @staticmethod
        def GPU(fn=None, duration=None):
            # Support both bare ``@spaces.GPU`` and parameterized
            # ``@spaces.GPU(duration=...)`` decorator forms.
            if fn is None:
                return lambda f: f
            return fn


# Detect GPU hardware type.  Skipped on ZeroGPU, where the device is
# attached lazily and cannot be probed at import time.
IS_T4_GPU = False
GPU_NAME = "Unknown"

if not IS_ZEROGPU and torch.cuda.is_available():
    try:
        GPU_NAME = torch.cuda.get_device_name(0)
        IS_T4_GPU = "T4" in GPU_NAME
    except Exception:
        # Best-effort probe: keep the safe defaults if the CUDA query fails.
        pass


# Set optimal parameters based on hardware.
if IS_ZEROGPU:
    DEFAULT_BATCH_SIZE = 128
    DEFAULT_NUM_WORKERS = 0
    DEFAULT_CONCURRENCY_LIMIT = 8  # ZeroGPU manages its own queue
    GPU_TYPE = "ZeroGPU (H100)"
elif IS_T4_GPU:
    DEFAULT_BATCH_SIZE = 64
    DEFAULT_NUM_WORKERS = 4
    DEFAULT_CONCURRENCY_LIMIT = 1  # T4 can only handle one analysis at a time (16GB memory)
    GPU_TYPE = f"T4 ({GPU_NAME})"
else:
    DEFAULT_BATCH_SIZE = 64
    DEFAULT_NUM_WORKERS = 8
    DEFAULT_CONCURRENCY_LIMIT = 8  # High-memory GPUs can handle multiple analyses
    GPU_TYPE = f"Standard GPU ({GPU_NAME})"


# Log hardware detection once at module load so every importer sees the
# same settings without re-probing the device.
logger.info(
    f"Hardware: {GPU_TYPE} | "
    f"batch_size={DEFAULT_BATCH_SIZE}, "
    f"num_workers={DEFAULT_NUM_WORKERS}, "
    f"concurrency_limit={DEFAULT_CONCURRENCY_LIMIT}"
)


# Export commonly used symbols
__all__ = [
    "spaces",
    "HAS_SPACES",
    "IS_ZEROGPU",
    "IS_T4_GPU",
    "GPU_NAME",
    "GPU_TYPE",
    "DEFAULT_BATCH_SIZE",
    "DEFAULT_NUM_WORKERS",
    "DEFAULT_CONCURRENCY_LIMIT",
]
src/mosaic/model_manager.py CHANGED
@@ -13,6 +13,7 @@ import torch
13
  from loguru import logger
14
 
15
  from mosaic.data_directory import get_data_directory
 
16
  from mussel.models import ModelType, get_model_factory
17
 
18
 
@@ -129,28 +130,25 @@ def load_all_models(
129
  logger.info("BATCH PROCESSING: Loading models (this happens ONCE per batch)")
130
  logger.info("=" * 80)
131
 
132
- # Detect GPU type
133
  device = torch.device("cpu")
134
- is_t4_gpu = False
135
 
136
  if use_gpu and torch.cuda.is_available():
137
  device = torch.device("cuda")
138
- gpu_name = torch.cuda.get_device_name(0)
139
- is_t4_gpu = "T4" in gpu_name
140
  gpu_memory_total = torch.cuda.get_device_properties(0).total_memory / (1024**3)
141
- logger.info(f"GPU detected: {gpu_name}")
142
  logger.info(f"GPU total memory: {gpu_memory_total:.2f} GB")
143
 
144
  # Log initial GPU memory
145
  mem_before = torch.cuda.memory_allocated() / (1024**3)
146
  logger.info(f"GPU memory before loading models: {mem_before:.2f} GB")
147
 
148
- # Auto-detect memory management strategy
149
  if aggressive_memory_mgmt is None:
150
- aggressive_memory_mgmt = is_t4_gpu
151
- strategy = "AGGRESSIVE (T4)" if is_t4_gpu else "CACHING (High-Memory GPU)"
152
  logger.info(f"Memory management strategy: {strategy}")
153
- if is_t4_gpu:
154
  logger.info(" → Paladin models will be loaded and freed per slide")
155
  else:
156
  logger.info(
@@ -247,7 +245,7 @@ def load_all_models(
247
  optimus_model=optimus_model,
248
  marker_classifier=marker_classifier,
249
  aeon_model=aeon_model,
250
- is_t4_gpu=is_t4_gpu,
251
  aggressive_memory_mgmt=aggressive_memory_mgmt,
252
  device=device,
253
  )
 
13
  from loguru import logger
14
 
15
  from mosaic.data_directory import get_data_directory
16
+ from mosaic.hardware import IS_T4_GPU, GPU_NAME
17
  from mussel.models import ModelType, get_model_factory
18
 
19
 
 
130
  logger.info("BATCH PROCESSING: Loading models (this happens ONCE per batch)")
131
  logger.info("=" * 80)
132
 
133
+ # Use centralized GPU detection
134
  device = torch.device("cpu")
 
135
 
136
  if use_gpu and torch.cuda.is_available():
137
  device = torch.device("cuda")
 
 
138
  gpu_memory_total = torch.cuda.get_device_properties(0).total_memory / (1024**3)
139
+ logger.info(f"GPU detected: {GPU_NAME}")
140
  logger.info(f"GPU total memory: {gpu_memory_total:.2f} GB")
141
 
142
  # Log initial GPU memory
143
  mem_before = torch.cuda.memory_allocated() / (1024**3)
144
  logger.info(f"GPU memory before loading models: {mem_before:.2f} GB")
145
 
146
+ # Auto-detect memory management strategy based on centralized hardware detection
147
  if aggressive_memory_mgmt is None:
148
+ aggressive_memory_mgmt = IS_T4_GPU
149
+ strategy = "AGGRESSIVE (T4)" if IS_T4_GPU else "CACHING (High-Memory GPU)"
150
  logger.info(f"Memory management strategy: {strategy}")
151
+ if IS_T4_GPU:
152
  logger.info(" → Paladin models will be loaded and freed per slide")
153
  else:
154
  logger.info(
 
245
  optimus_model=optimus_model,
246
  marker_classifier=marker_classifier,
247
  aeon_model=aeon_model,
248
+ is_t4_gpu=IS_T4_GPU,
249
  aggressive_memory_mgmt=aggressive_memory_mgmt,
250
  device=device,
251
  )
src/mosaic/ui/app.py CHANGED
@@ -25,6 +25,7 @@ from mosaic.ui.utils import (
25
  )
26
  from mosaic.analysis import analyze_slide
27
  from mosaic.model_manager import load_all_models
 
28
 
29
  current_dir = Path(__file__).parent.parent
30
 
@@ -271,32 +272,22 @@ def analyze_slides(
271
  # Final yield with complete results
272
  # Hide settings table if only one slide, keep visible for multiple slides
273
  settings_visible = len(slides) > 1
274
-
275
  # Store final results before cleanup
276
  final_slide_masks = all_slide_masks
277
  final_combined_paladin = combined_paladin_results if len(combined_paladin_results) > 0 else None
278
-
279
- # Aggressive memory cleanup after storing final results
280
  import gc
281
- import torch
282
-
283
- # Clear intermediate data structures
284
  all_slide_masks = None
285
  all_aeon_results = None
286
  all_paladin_results = None
287
  combined_paladin_results = None
288
-
289
- # Force garbage collection
290
  gc.collect()
291
 
292
- # Clear GPU cache if available
293
- if torch.cuda.is_available():
294
- torch.cuda.synchronize()
295
- torch.cuda.empty_cache()
296
- mem_allocated = torch.cuda.memory_allocated() / (1024**3)
297
- mem_reserved = torch.cuda.memory_reserved() / (1024**3)
298
- logger.info(f"GPU memory after final cleanup: {mem_allocated:.2f} GB allocated, {mem_reserved:.2f} GB reserved")
299
-
300
  yield (
301
  gr.Dataframe(value=settings_input, visible=settings_visible), # Hide if single slide
302
  final_slide_masks,
@@ -626,7 +617,10 @@ def launch_gradio(server_name, server_port, share):
626
  outputs=[user_dir_state],
627
  )
628
 
629
- demo.queue(max_size=10, default_concurrency_limit=8)
 
 
 
630
  demo.launch(
631
  server_name=server_name,
632
  share=share,
 
25
  )
26
  from mosaic.analysis import analyze_slide
27
  from mosaic.model_manager import load_all_models
28
+ from mosaic.hardware import DEFAULT_CONCURRENCY_LIMIT, IS_T4_GPU
29
 
30
  current_dir = Path(__file__).parent.parent
31
 
 
272
  # Final yield with complete results
273
  # Hide settings table if only one slide, keep visible for multiple slides
274
  settings_visible = len(slides) > 1
275
+
276
  # Store final results before cleanup
277
  final_slide_masks = all_slide_masks
278
  final_combined_paladin = combined_paladin_results if len(combined_paladin_results) > 0 else None
279
+
280
+ # Memory cleanup: Clear intermediate data structures from RAM
281
  import gc
282
+
 
 
283
  all_slide_masks = None
284
  all_aeon_results = None
285
  all_paladin_results = None
286
  combined_paladin_results = None
287
+
288
+ # Force garbage collection to free Python memory
289
  gc.collect()
290
 
 
 
 
 
 
 
 
 
291
  yield (
292
  gr.Dataframe(value=settings_input, visible=settings_visible), # Hide if single slide
293
  final_slide_masks,
 
617
  outputs=[user_dir_state],
618
  )
619
 
620
+ # Use hardware-specific concurrency limit
621
+ # T4 GPUs (16GB) can only handle one analysis at a time to prevent OOM
622
+ # Higher-memory GPUs and ZeroGPU can handle multiple concurrent analyses
623
+ demo.queue(max_size=10, default_concurrency_limit=DEFAULT_CONCURRENCY_LIMIT)
624
  demo.launch(
625
  server_name=server_name,
626
  share=share,