Spaces: Running on Zero
Keep core models loaded across requests on T4

On T4 with concurrency_limit=1, models are now loaded once globally
and persist across all sequential requests. Only Paladin models are
cleaned up after each request, while core models (CTransPath, Optimus,
Aeon, marker_classifier) remain in GPU memory.

This eliminates the overhead of reloading 8-12 GB of models for every
single-slide request on T4 instances.

High-memory GPUs continue to use the previous per-batch loading strategy.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
Changed: src/mosaic/ui/app.py (+24, -3)

(Reconstructed unified diff from the garbled side-by-side view. The exact text
of the three removed lines was blanked in the extraction and has been inferred
from the surrounding new-side context — verify against the actual commit.)

@@ -34,6 +34,9 @@
 reversed_cancer_subtype_name_map = {}
 cancer_subtypes = []
 
+# Global model cache for T4 (to persist models across sequential requests)
+_global_model_cache = None
+
 
 def set_cancer_subtype_maps(csn_map, rcsn_map, cs):
     """Set the global cancer subtype maps."""
@@ -117,8 +120,21 @@
     )
 
     # Load models once (for batch) or per-slide (for single)
+    # On T4: Keep models loaded globally across all requests (concurrency=1 ensures no conflicts)
+    # On high-memory GPUs: Load models per-batch, reload for single slides
+    global _global_model_cache
     model_cache = None
-    if len(slides) > 1:
+
+    if IS_T4_GPU:
+        # T4: Use global cache to keep models loaded across requests
+        if _global_model_cache is None:
+            logger.info("T4: Loading models once (will persist across all requests)")
+            progress(0.0, desc="Loading models (one-time initialization)")
+            _global_model_cache = load_all_models(use_gpu=True, aggressive_memory_mgmt=None)
+        else:
+            logger.info(f"T4: Reusing pre-loaded models from global cache")
+        model_cache = _global_model_cache
+    elif len(slides) > 1:
         logger.info(f"Batch mode: Loading models once for {len(slides)} slides")
         progress(0.0, desc=f"Loading models for batch processing")
         model_cache = load_all_models(use_gpu=True, aggressive_memory_mgmt=None)
@@ -215,9 +231,14 @@
 
     finally:
         # Clean up model cache if it was loaded for batch processing
-        if model_cache is not None:
-            logger.info("Cleaning up model cache after batch")
+        # On T4: Keep global cache loaded, only cleanup Paladin models
+        # On high-memory GPUs: Cleanup everything after batch
+        if model_cache is not None and not IS_T4_GPU:
+            logger.info("Cleaning up model cache after batch")
             model_cache.cleanup()
+        elif IS_T4_GPU and model_cache is not None:
+            logger.info("T4: Keeping core models loaded, cleaning up Paladin models only")
+            model_cache.cleanup_paladin()
 
         progress(0.99, desc="Analysis complete, wrapping up results")