raylim Claude Sonnet 4.5 committed on
Commit
a2d70a9
·
unverified ·
1 Parent(s): 42a4892

Keep core models loaded across requests on T4

Browse files

On T4 with concurrency_limit=1, models are now loaded once globally
and persist across all sequential requests. Only Paladin models are
cleaned up after each request, while core models (CTransPath, Optimus,
Aeon, marker_classifier) remain in GPU memory.

This eliminates the overhead of reloading 8-12GB of models for every
single-slide request on T4 instances.

High-memory GPUs continue to use the previous per-batch loading strategy.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

Files changed (1) hide show
  1. src/mosaic/ui/app.py +24 -3
src/mosaic/ui/app.py CHANGED
@@ -34,6 +34,9 @@ cancer_subtype_name_map = {}
34
  reversed_cancer_subtype_name_map = {}
35
  cancer_subtypes = []
36
 
 
 
 
37
 
38
  def set_cancer_subtype_maps(csn_map, rcsn_map, cs):
39
  """Set the global cancer subtype maps."""
@@ -117,8 +120,21 @@ def analyze_slides(
117
  )
118
 
119
  # Load models once (for batch) or per-slide (for single)
 
 
 
120
  model_cache = None
121
- if len(slides) > 1:
 
 
 
 
 
 
 
 
 
 
122
  logger.info(f"Batch mode: Loading models once for {len(slides)} slides")
123
  progress(0.0, desc=f"Loading models for batch processing")
124
  model_cache = load_all_models(use_gpu=True, aggressive_memory_mgmt=None)
@@ -215,9 +231,14 @@ def analyze_slides(
215
 
216
  finally:
217
  # Clean up model cache if it was loaded for batch processing
218
- if model_cache is not None:
219
- logger.info("Cleaning up model cache")
 
 
220
  model_cache.cleanup()
 
 
 
221
 
222
  progress(0.99, desc="Analysis complete, wrapping up results")
223
 
 
34
  reversed_cancer_subtype_name_map = {}
35
  cancer_subtypes = []
36
 
37
+ # Global model cache for T4 (to persist models across sequential requests)
38
+ _global_model_cache = None
39
+
40
 
41
  def set_cancer_subtype_maps(csn_map, rcsn_map, cs):
42
  """Set the global cancer subtype maps."""
 
120
  )
121
 
122
  # Load models once (for batch) or per-slide (for single)
123
+ # On T4: Keep models loaded globally across all requests (concurrency=1 ensures no conflicts)
124
+ # On high-memory GPUs: Load models per-batch, reload for single slides
125
+ global _global_model_cache
126
  model_cache = None
127
+
128
+ if IS_T4_GPU:
129
+ # T4: Use global cache to keep models loaded across requests
130
+ if _global_model_cache is None:
131
+ logger.info("T4: Loading models once (will persist across all requests)")
132
+ progress(0.0, desc="Loading models (one-time initialization)")
133
+ _global_model_cache = load_all_models(use_gpu=True, aggressive_memory_mgmt=None)
134
+ else:
135
+ logger.info(f"T4: Reusing pre-loaded models from global cache")
136
+ model_cache = _global_model_cache
137
+ elif len(slides) > 1:
138
  logger.info(f"Batch mode: Loading models once for {len(slides)} slides")
139
  progress(0.0, desc=f"Loading models for batch processing")
140
  model_cache = load_all_models(use_gpu=True, aggressive_memory_mgmt=None)
 
231
 
232
  finally:
233
  # Clean up model cache if it was loaded for batch processing
234
+ # On T4: Keep global cache loaded, only cleanup Paladin models
235
+ # On high-memory GPUs: Cleanup everything after batch
236
+ if model_cache is not None and not IS_T4_GPU:
237
+ logger.info("Cleaning up model cache after batch")
238
  model_cache.cleanup()
239
+ elif IS_T4_GPU and model_cache is not None:
240
+ logger.info("T4: Keeping core models loaded, cleaning up Paladin models only")
241
+ model_cache.cleanup_paladin()
242
 
243
  progress(0.99, desc="Analysis complete, wrapping up results")
244