raylim Claude Sonnet 4.5 committed on
Commit
42a4892
·
unverified ·
1 Parent(s): 4fda083

Centralize hardware detection and optimize T4 GPU memory management

Browse files

- Create new mosaic.hardware module for centralized GPU detection
- Set T4 concurrency limit to 1 to prevent concurrent OOM errors
- Update analysis.py, model_manager.py, and ui/app.py to use centralized detection
- Simplify cleanup logic: only Paladin models cleared on T4, core models persist
- Remove redundant GPU cleanup code (already handled by model_cache.cleanup())

Fixes T4 memory issues by enforcing sequential processing while maintaining
efficient batch processing on high-memory GPUs.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

src/mosaic/analysis.py CHANGED
@@ -4,57 +4,9 @@ This module provides the main slide analysis pipeline that integrates tissue seg
4
  feature extraction, and model inference for cancer subtype and biomarker prediction.
5
  """
6
 
7
- # Import spaces first before any CUDA-related imports
8
- import os
9
-
10
- try:
11
- import spaces
12
-
13
- HAS_SPACES = True
14
- # Check if we're actually running on ZeroGPU
15
- # HF Spaces sets SPACES_ZERO_GPU=1 when using ZeroGPU
16
- IS_ZEROGPU = os.environ.get("SPACES_ZERO_GPU") == "1"
17
- except ImportError:
18
- HAS_SPACES = False
19
- IS_ZEROGPU = False
20
-
21
- # Create a no-op decorator if spaces is not available
22
- class spaces:
23
- @staticmethod
24
- def GPU(fn=None, duration=None):
25
- if fn is None:
26
- return lambda f: f
27
- return fn
28
-
29
-
30
- # Detect T4 hardware by checking actual GPU
31
- import torch
32
-
33
- IS_T4_GPU = False
34
- GPU_NAME = "Unknown"
35
- if not IS_ZEROGPU and torch.cuda.is_available():
36
- try:
37
- GPU_NAME = torch.cuda.get_device_name(0)
38
- IS_T4_GPU = "T4" in GPU_NAME
39
- except:
40
- pass
41
-
42
- # Set optimal parameters based on hardware
43
- if IS_ZEROGPU:
44
- DEFAULT_BATCH_SIZE = 128
45
- DEFAULT_NUM_WORKERS = 0
46
- GPU_TYPE = "ZeroGPU (H100)"
47
- elif IS_T4_GPU:
48
- DEFAULT_BATCH_SIZE = 64
49
- DEFAULT_NUM_WORKERS = 4
50
- GPU_TYPE = f"T4 ({GPU_NAME})"
51
- else:
52
- DEFAULT_BATCH_SIZE = 64
53
- DEFAULT_NUM_WORKERS = 8
54
- GPU_TYPE = f"Standard GPU ({GPU_NAME})"
55
-
56
  import pickle
57
  import gc
 
58
  import pandas as pd
59
  import gradio as gr
60
  from pathlib import Path
@@ -66,9 +18,14 @@ from loguru import logger
66
  from mosaic.inference import run_aeon, run_paladin
67
  from mosaic.data_directory import get_data_directory
68
 
69
- # Log hardware detection at module load
70
- logger.info(
71
- f"Hardware: {GPU_TYPE} | batch_size={DEFAULT_BATCH_SIZE}, num_workers={DEFAULT_NUM_WORKERS}"
 
 
 
 
 
72
  )
73
 
74
 
@@ -441,14 +398,11 @@ def _run_inference_pipeline_impl(
441
  # Clean up models to free GPU memory
442
  logger.info("Cleaning up models after single-slide inference")
443
  model_cache.cleanup()
444
-
445
- # Extra aggressive cleanup for T4 instances
446
- if torch.cuda.is_available():
447
  torch.cuda.synchronize()
448
- torch.cuda.empty_cache()
449
- gc.collect()
450
- mem_allocated = torch.cuda.memory_allocated() / (1024**3)
451
- logger.info(f"GPU memory after cleanup: {mem_allocated:.2f} GB")
452
 
453
 
454
  # ============================================================================
 
4
  feature extraction, and model inference for cancer subtype and biomarker prediction.
5
  """
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  import pickle
8
  import gc
9
+ import torch
10
  import pandas as pd
11
  import gradio as gr
12
  from pathlib import Path
 
18
  from mosaic.inference import run_aeon, run_paladin
19
  from mosaic.data_directory import get_data_directory
20
 
21
+ # Import centralized hardware detection
22
+ from mosaic.hardware import (
23
+ spaces,
24
+ IS_ZEROGPU,
25
+ IS_T4_GPU,
26
+ GPU_TYPE,
27
+ DEFAULT_BATCH_SIZE,
28
+ DEFAULT_NUM_WORKERS,
29
  )
30
 
31
 
 
398
  # Clean up models to free GPU memory
399
  logger.info("Cleaning up models after single-slide inference")
400
  model_cache.cleanup()
401
+
402
+ # T4-specific: Ensure GPU operations are complete before next request
403
+ if IS_T4_GPU and torch.cuda.is_available():
404
  torch.cuda.synchronize()
405
+ logger.info("T4: GPU operations synchronized")
 
 
 
406
 
407
 
408
  # ============================================================================
src/mosaic/hardware.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Hardware detection and configuration for GPU-specific optimizations.

This module centralizes all hardware detection logic to provide consistent
GPU-specific settings across the Mosaic application.

Exports (see ``__all__``): the ``spaces`` decorator provider, environment
flags (``HAS_SPACES``, ``IS_ZEROGPU``, ``IS_T4_GPU``), the detected
``GPU_NAME``/``GPU_TYPE`` strings, and hardware-tuned defaults
(``DEFAULT_BATCH_SIZE``, ``DEFAULT_NUM_WORKERS``, ``DEFAULT_CONCURRENCY_LIMIT``).
"""

import os

import torch
from loguru import logger

# Detect the HuggingFace Spaces ZeroGPU environment.
try:
    import spaces

    HAS_SPACES = True
    # HF Spaces sets SPACES_ZERO_GPU=1 when the Space runs on ZeroGPU.
    IS_ZEROGPU = os.environ.get("SPACES_ZERO_GPU") == "1"
except ImportError:
    HAS_SPACES = False
    IS_ZEROGPU = False

    # Fallback no-op decorator so ``@spaces.GPU`` keeps working outside
    # HF Spaces.  This class is deliberately defined ONLY inside the
    # except branch: defining it unconditionally would shadow the real
    # ``spaces`` module and silently disable ZeroGPU allocation.
    class spaces:
        @staticmethod
        def GPU(fn=None, duration=None):
            # Support both bare ``@spaces.GPU`` and parameterized
            # ``@spaces.GPU(duration=...)`` decorator forms.
            if fn is None:
                return lambda f: f
            return fn


# Detect GPU hardware type.  Skipped on ZeroGPU, where the device is
# attached lazily and cannot be probed at import time.
IS_T4_GPU = False
GPU_NAME = "Unknown"

if not IS_ZEROGPU and torch.cuda.is_available():
    try:
        GPU_NAME = torch.cuda.get_device_name(0)
        IS_T4_GPU = "T4" in GPU_NAME
    except Exception:
        # Best-effort probe: keep the safe defaults if the CUDA query fails.
        pass


# Set optimal parameters based on hardware.
if IS_ZEROGPU:
    DEFAULT_BATCH_SIZE = 128
    DEFAULT_NUM_WORKERS = 0
    DEFAULT_CONCURRENCY_LIMIT = 8  # ZeroGPU manages its own queue
    GPU_TYPE = "ZeroGPU (H100)"
elif IS_T4_GPU:
    DEFAULT_BATCH_SIZE = 64
    DEFAULT_NUM_WORKERS = 4
    DEFAULT_CONCURRENCY_LIMIT = 1  # T4 can only handle one analysis at a time (16GB memory)
    GPU_TYPE = f"T4 ({GPU_NAME})"
else:
    DEFAULT_BATCH_SIZE = 64
    DEFAULT_NUM_WORKERS = 8
    DEFAULT_CONCURRENCY_LIMIT = 8  # High-memory GPUs can handle multiple analyses
    GPU_TYPE = f"Standard GPU ({GPU_NAME})"


# Log hardware detection once at module load so every importer sees the
# same settings without re-probing the device.
logger.info(
    f"Hardware: {GPU_TYPE} | "
    f"batch_size={DEFAULT_BATCH_SIZE}, "
    f"num_workers={DEFAULT_NUM_WORKERS}, "
    f"concurrency_limit={DEFAULT_CONCURRENCY_LIMIT}"
)


# Export commonly used symbols
__all__ = [
    "spaces",
    "HAS_SPACES",
    "IS_ZEROGPU",
    "IS_T4_GPU",
    "GPU_NAME",
    "GPU_TYPE",
    "DEFAULT_BATCH_SIZE",
    "DEFAULT_NUM_WORKERS",
    "DEFAULT_CONCURRENCY_LIMIT",
]
src/mosaic/model_manager.py CHANGED
@@ -13,6 +13,7 @@ import torch
13
  from loguru import logger
14
 
15
  from mosaic.data_directory import get_data_directory
 
16
  from mussel.models import ModelType, get_model_factory
17
 
18
 
@@ -129,28 +130,25 @@ def load_all_models(
129
  logger.info("BATCH PROCESSING: Loading models (this happens ONCE per batch)")
130
  logger.info("=" * 80)
131
 
132
- # Detect GPU type
133
  device = torch.device("cpu")
134
- is_t4_gpu = False
135
 
136
  if use_gpu and torch.cuda.is_available():
137
  device = torch.device("cuda")
138
- gpu_name = torch.cuda.get_device_name(0)
139
- is_t4_gpu = "T4" in gpu_name
140
  gpu_memory_total = torch.cuda.get_device_properties(0).total_memory / (1024**3)
141
- logger.info(f"GPU detected: {gpu_name}")
142
  logger.info(f"GPU total memory: {gpu_memory_total:.2f} GB")
143
 
144
  # Log initial GPU memory
145
  mem_before = torch.cuda.memory_allocated() / (1024**3)
146
  logger.info(f"GPU memory before loading models: {mem_before:.2f} GB")
147
 
148
- # Auto-detect memory management strategy
149
  if aggressive_memory_mgmt is None:
150
- aggressive_memory_mgmt = is_t4_gpu
151
- strategy = "AGGRESSIVE (T4)" if is_t4_gpu else "CACHING (High-Memory GPU)"
152
  logger.info(f"Memory management strategy: {strategy}")
153
- if is_t4_gpu:
154
  logger.info(" → Paladin models will be loaded and freed per slide")
155
  else:
156
  logger.info(
@@ -247,7 +245,7 @@ def load_all_models(
247
  optimus_model=optimus_model,
248
  marker_classifier=marker_classifier,
249
  aeon_model=aeon_model,
250
- is_t4_gpu=is_t4_gpu,
251
  aggressive_memory_mgmt=aggressive_memory_mgmt,
252
  device=device,
253
  )
 
13
  from loguru import logger
14
 
15
  from mosaic.data_directory import get_data_directory
16
+ from mosaic.hardware import IS_T4_GPU, GPU_NAME
17
  from mussel.models import ModelType, get_model_factory
18
 
19
 
 
130
  logger.info("BATCH PROCESSING: Loading models (this happens ONCE per batch)")
131
  logger.info("=" * 80)
132
 
133
+ # Use centralized GPU detection
134
  device = torch.device("cpu")
 
135
 
136
  if use_gpu and torch.cuda.is_available():
137
  device = torch.device("cuda")
 
 
138
  gpu_memory_total = torch.cuda.get_device_properties(0).total_memory / (1024**3)
139
+ logger.info(f"GPU detected: {GPU_NAME}")
140
  logger.info(f"GPU total memory: {gpu_memory_total:.2f} GB")
141
 
142
  # Log initial GPU memory
143
  mem_before = torch.cuda.memory_allocated() / (1024**3)
144
  logger.info(f"GPU memory before loading models: {mem_before:.2f} GB")
145
 
146
+ # Auto-detect memory management strategy based on centralized hardware detection
147
  if aggressive_memory_mgmt is None:
148
+ aggressive_memory_mgmt = IS_T4_GPU
149
+ strategy = "AGGRESSIVE (T4)" if IS_T4_GPU else "CACHING (High-Memory GPU)"
150
  logger.info(f"Memory management strategy: {strategy}")
151
+ if IS_T4_GPU:
152
  logger.info(" → Paladin models will be loaded and freed per slide")
153
  else:
154
  logger.info(
 
245
  optimus_model=optimus_model,
246
  marker_classifier=marker_classifier,
247
  aeon_model=aeon_model,
248
+ is_t4_gpu=IS_T4_GPU,
249
  aggressive_memory_mgmt=aggressive_memory_mgmt,
250
  device=device,
251
  )
src/mosaic/ui/app.py CHANGED
@@ -25,6 +25,7 @@ from mosaic.ui.utils import (
25
  )
26
  from mosaic.analysis import analyze_slide
27
  from mosaic.model_manager import load_all_models
 
28
 
29
  current_dir = Path(__file__).parent.parent
30
 
@@ -271,32 +272,22 @@ def analyze_slides(
271
  # Final yield with complete results
272
  # Hide settings table if only one slide, keep visible for multiple slides
273
  settings_visible = len(slides) > 1
274
-
275
  # Store final results before cleanup
276
  final_slide_masks = all_slide_masks
277
  final_combined_paladin = combined_paladin_results if len(combined_paladin_results) > 0 else None
278
-
279
- # Aggressive memory cleanup after storing final results
280
  import gc
281
- import torch
282
-
283
- # Clear intermediate data structures
284
  all_slide_masks = None
285
  all_aeon_results = None
286
  all_paladin_results = None
287
  combined_paladin_results = None
288
-
289
- # Force garbage collection
290
  gc.collect()
291
 
292
- # Clear GPU cache if available
293
- if torch.cuda.is_available():
294
- torch.cuda.synchronize()
295
- torch.cuda.empty_cache()
296
- mem_allocated = torch.cuda.memory_allocated() / (1024**3)
297
- mem_reserved = torch.cuda.memory_reserved() / (1024**3)
298
- logger.info(f"GPU memory after final cleanup: {mem_allocated:.2f} GB allocated, {mem_reserved:.2f} GB reserved")
299
-
300
  yield (
301
  gr.Dataframe(value=settings_input, visible=settings_visible), # Hide if single slide
302
  final_slide_masks,
@@ -626,7 +617,10 @@ def launch_gradio(server_name, server_port, share):
626
  outputs=[user_dir_state],
627
  )
628
 
629
- demo.queue(max_size=10, default_concurrency_limit=8)
 
 
 
630
  demo.launch(
631
  server_name=server_name,
632
  share=share,
 
25
  )
26
  from mosaic.analysis import analyze_slide
27
  from mosaic.model_manager import load_all_models
28
+ from mosaic.hardware import DEFAULT_CONCURRENCY_LIMIT, IS_T4_GPU
29
 
30
  current_dir = Path(__file__).parent.parent
31
 
 
272
  # Final yield with complete results
273
  # Hide settings table if only one slide, keep visible for multiple slides
274
  settings_visible = len(slides) > 1
275
+
276
  # Store final results before cleanup
277
  final_slide_masks = all_slide_masks
278
  final_combined_paladin = combined_paladin_results if len(combined_paladin_results) > 0 else None
279
+
280
+ # Memory cleanup: Clear intermediate data structures from RAM
281
  import gc
282
+
 
 
283
  all_slide_masks = None
284
  all_aeon_results = None
285
  all_paladin_results = None
286
  combined_paladin_results = None
287
+
288
+ # Force garbage collection to free Python memory
289
  gc.collect()
290
 
 
 
 
 
 
 
 
 
291
  yield (
292
  gr.Dataframe(value=settings_input, visible=settings_visible), # Hide if single slide
293
  final_slide_masks,
 
617
  outputs=[user_dir_state],
618
  )
619
 
620
+ # Use hardware-specific concurrency limit
621
+ # T4 GPUs (16GB) can only handle one analysis at a time to prevent OOM
622
+ # Higher-memory GPUs and ZeroGPU can handle multiple concurrent analyses
623
+ demo.queue(max_size=10, default_concurrency_limit=DEFAULT_CONCURRENCY_LIMIT)
624
  demo.launch(
625
  server_name=server_name,
626
  share=share,