raylim Claude Sonnet 4.5 committed on
Commit
6e06a36
·
unverified ·
1 Parent(s): 42bcf72

Add comprehensive logging for batch processing verification

Browse files

Improves logging throughout the batch processing pipeline to clearly
demonstrate that the optimization is working correctly.

Model Manager (src/mosaic/model_manager.py):
- Add detailed GPU detection and memory reporting
- Show total GPU memory and memory before/after loading each model
- Log memory management strategy (T4 aggressive vs A100 caching)
- Clear banner indicating models loaded ONCE per batch
- Show Paladin model cache hits vs new loads with ✓ indicators
- Distinguish between aggressive mode (load/free) and caching mode

Batch Analysis (src/mosaic/batch_analysis.py):
- Add comprehensive timing for entire batch and per-slide processing
- Log batch start with clear separator
- Show per-slide progress with [n/total] indicators
- Track and report slide processing times (avg, min, max, total)
- Calculate and display batch overhead vs processing time
- Add detailed summary at end with:
* Success/failure counts
* Model loading time (done once)
* Total batch time
* Per-slide statistics
* Efficiency metrics
* Optimization benefits summary

Pipeline Analysis (src/mosaic/analysis.py):
- Indicate when using PRE-LOADED models (marker classifier, Aeon)
- Makes it clear models are being reused, not reloaded

Example output format:
================================================================================
BATCH PROCESSING: Starting analysis of 10 slides
================================================================================
GPU detected: NVIDIA Tesla T4
Memory management strategy: AGGRESSIVE (T4)
Loading models...
✓ Marker Classifier loaded (GPU: 0.15 GB)
✓ Aeon model loaded (GPU: 0.45 GB)
✓ All core models loaded (Total: 0.45 GB)
These models will be REUSED for all slides in this batch
Model loading completed in 3.2s

[1/10] Processing: slide1.svs
Using pre-loaded models (no disk I/O for core models)
[1/10] ✓ Completed in 45.2s

BATCH PROCESSING SUMMARY
Total slides: 10
Successfully processed: 10
Model loading time: 3.2s (done ONCE for entire batch)
Total batch time: 458.5s
Per-slide times: Avg: 45.5s, Min: 42.1s, Max: 48.3s
✓ Batch processing optimization benefits:
- Models loaded ONCE (not once per slide)
- Reduced disk I/O for model loading

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

src/mosaic/analysis.py CHANGED
@@ -540,7 +540,7 @@ def _run_inference_pipeline_with_models(
540
  # Step 2: Filter features using pre-loaded marker classifier
541
  start_time = pd.Timestamp.now()
542
  progress(0.35, desc="Filtering features with marker classifier")
543
- logger.info("Filtering features with marker classifier")
544
  _, filtered_coords = filter_features(
545
  ctranspath_features,
546
  coords,
@@ -563,7 +563,7 @@ def _run_inference_pipeline_with_models(
563
 
564
  # Check if cancer subtype is unknown
565
  if cancer_subtype in ["Unknown", None]:
566
- logger.info("Running Aeon inference (cancer subtype unknown)")
567
  aeon_results = _run_aeon_inference_with_model(
568
  features,
569
  model_cache.aeon_model, # Use pre-loaded Aeon model
 
540
  # Step 2: Filter features using pre-loaded marker classifier
541
  start_time = pd.Timestamp.now()
542
  progress(0.35, desc="Filtering features with marker classifier")
543
+ logger.info("Filtering features with PRE-LOADED marker classifier")
544
  _, filtered_coords = filter_features(
545
  ctranspath_features,
546
  coords,
 
563
 
564
  # Check if cancer subtype is unknown
565
  if cancer_subtype in ["Unknown", None]:
566
+ logger.info("Running Aeon inference with PRE-LOADED model (cancer subtype unknown)")
567
  aeon_results = _run_aeon_inference_with_model(
568
  features,
569
  model_cache.aeon_model, # Use pre-loaded Aeon model
src/mosaic/batch_analysis.py CHANGED
@@ -7,6 +7,7 @@ overhead compared to processing slides individually.
7
 
8
  from typing import Dict, List, Optional, Tuple
9
  import pandas as pd
 
10
  from loguru import logger
11
 
12
  from mosaic.model_manager import load_all_models
@@ -77,24 +78,31 @@ def analyze_slides_batch(
77
  progress = lambda frac, desc: None # No-op progress function
78
 
79
  num_slides = len(slides)
80
- logger.info(f"Starting batch analysis of {num_slides} slides with models loaded once")
 
 
 
 
81
 
82
  # Step 1: Load all models once
83
- logger.info("Loading models for batch processing...")
84
  progress(0.0, desc="Loading models for batch processing")
 
85
 
86
  try:
87
  model_cache = load_all_models(
88
  use_gpu=True,
89
  aggressive_memory_mgmt=aggressive_memory_mgmt,
90
  )
91
- logger.info("Models loaded successfully")
 
 
 
92
 
93
  # Log memory strategy
94
  if model_cache.aggressive_memory_mgmt:
95
  logger.info(
96
- "Using aggressive memory management (T4-style): "
97
- "Paladin models will be loaded and freed per slide"
98
  )
99
  else:
100
  logger.info(
@@ -110,6 +118,11 @@ def analyze_slides_batch(
110
  all_slide_masks = []
111
  all_aeon_results = []
112
  all_paladin_results = []
 
 
 
 
 
113
 
114
  try:
115
  for idx, (slide_path, (_, row)) in enumerate(zip(slides, settings_df.iterrows())):
@@ -119,7 +132,10 @@ def analyze_slides_batch(
119
  progress_frac = (idx + 0.1) / num_slides
120
  progress(progress_frac, desc=f"Analyzing slide {idx + 1}/{num_slides}: {slide_name}")
121
 
122
- logger.info(f"Processing slide {idx + 1}/{num_slides}: {slide_name}")
 
 
 
123
 
124
  try:
125
  # Use batch-optimized analysis with pre-loaded models
@@ -137,6 +153,9 @@ def analyze_slides_batch(
137
  progress=progress,
138
  )
139
 
 
 
 
140
  # Collect results
141
  if slide_mask is not None:
142
  all_slide_masks.append((slide_mask, slide_name))
@@ -154,24 +173,66 @@ def analyze_slides_batch(
154
  )
155
  all_paladin_results.append(paladin_results)
156
 
157
- logger.info(f"Successfully processed slide {idx + 1}/{num_slides}")
158
 
159
  except Exception as e:
160
- logger.exception(f"Error processing slide {slide_name}: {e}")
 
 
161
  # Continue with next slide instead of failing entire batch
162
  continue
163
 
164
  finally:
165
  # Step 3: Always cleanup models (even if there were errors)
 
 
166
  logger.info("Cleaning up models...")
167
  progress(0.99, desc="Cleaning up models")
168
  model_cache.cleanup()
169
- logger.info("Model cleanup complete")
170
-
171
- progress(1.0, desc=f"Batch analysis complete ({num_slides} slides)")
172
- logger.info(
173
- f"Batch analysis complete: "
174
- f"Processed {len(all_slide_masks)}/{num_slides} slides successfully"
175
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
 
177
  return all_slide_masks, all_aeon_results, all_paladin_results
 
7
 
8
  from typing import Dict, List, Optional, Tuple
9
  import pandas as pd
10
+ import time
11
  from loguru import logger
12
 
13
  from mosaic.model_manager import load_all_models
 
78
  progress = lambda frac, desc: None # No-op progress function
79
 
80
  num_slides = len(slides)
81
+ batch_start_time = time.time()
82
+
83
+ logger.info("=" * 80)
84
+ logger.info(f"BATCH PROCESSING: Starting analysis of {num_slides} slides")
85
+ logger.info("=" * 80)
86
 
87
  # Step 1: Load all models once
 
88
  progress(0.0, desc="Loading models for batch processing")
89
+ model_load_start = time.time()
90
 
91
  try:
92
  model_cache = load_all_models(
93
  use_gpu=True,
94
  aggressive_memory_mgmt=aggressive_memory_mgmt,
95
  )
96
+
97
+ model_load_time = time.time() - model_load_start
98
+ logger.info(f"Model loading completed in {model_load_time:.2f}s")
99
+ logger.info("")
100
 
101
  # Log memory strategy
102
  if model_cache.aggressive_memory_mgmt:
103
  logger.info(
104
+ "Memory strategy: AGGRESSIVE (T4-style) - "
105
+ "Paladin models loaded/freed per slide"
106
  )
107
  else:
108
  logger.info(
 
118
  all_slide_masks = []
119
  all_aeon_results = []
120
  all_paladin_results = []
121
+ slide_times = []
122
+
123
+ logger.info("=" * 80)
124
+ logger.info("Processing slides with PRE-LOADED models (no model reloading!)")
125
+ logger.info("=" * 80)
126
 
127
  try:
128
  for idx, (slide_path, (_, row)) in enumerate(zip(slides, settings_df.iterrows())):
 
132
  progress_frac = (idx + 0.1) / num_slides
133
  progress(progress_frac, desc=f"Analyzing slide {idx + 1}/{num_slides}: {slide_name}")
134
 
135
+ logger.info("")
136
+ logger.info(f"[{idx + 1}/{num_slides}] Processing: {slide_name}")
137
+ logger.info(f" Using pre-loaded models (no disk I/O for core models)")
138
+ slide_start_time = time.time()
139
 
140
  try:
141
  # Use batch-optimized analysis with pre-loaded models
 
153
  progress=progress,
154
  )
155
 
156
+ slide_time = time.time() - slide_start_time
157
+ slide_times.append(slide_time)
158
+
159
  # Collect results
160
  if slide_mask is not None:
161
  all_slide_masks.append((slide_mask, slide_name))
 
173
  )
174
  all_paladin_results.append(paladin_results)
175
 
176
+ logger.info(f"[{idx + 1}/{num_slides}] βœ“ Completed in {slide_time:.2f}s")
177
 
178
  except Exception as e:
179
+ slide_time = time.time() - slide_start_time
180
+ slide_times.append(slide_time)
181
+ logger.exception(f"[{idx + 1}/{num_slides}] βœ— Failed after {slide_time:.2f}s: {e}")
182
  # Continue with next slide instead of failing entire batch
183
  continue
184
 
185
  finally:
186
  # Step 3: Always cleanup models (even if there were errors)
187
+ logger.info("")
188
+ logger.info("=" * 80)
189
  logger.info("Cleaning up models...")
190
  progress(0.99, desc="Cleaning up models")
191
  model_cache.cleanup()
192
+ logger.info("βœ“ Model cleanup complete")
193
+
194
+ # Calculate batch statistics
195
+ batch_total_time = time.time() - batch_start_time
196
+ num_successful = len(all_slide_masks)
197
+ num_failed = num_slides - num_successful
198
+
199
+ # Log comprehensive summary
200
+ logger.info("=" * 80)
201
+ logger.info("BATCH PROCESSING SUMMARY")
202
+ logger.info("=" * 80)
203
+ logger.info(f"Total slides: {num_slides}")
204
+ logger.info(f"Successfully processed: {num_successful}")
205
+ logger.info(f"Failed: {num_failed}")
206
+ logger.info("")
207
+ logger.info(f"Model loading time: {model_load_time:.2f}s (done ONCE for entire batch)")
208
+ logger.info(f"Total batch time: {batch_total_time:.2f}s")
209
+
210
+ if slide_times:
211
+ avg_slide_time = sum(slide_times) / len(slide_times)
212
+ min_slide_time = min(slide_times)
213
+ max_slide_time = max(slide_times)
214
+ total_slide_time = sum(slide_times)
215
+
216
+ logger.info("")
217
+ logger.info("Per-slide processing times:")
218
+ logger.info(f" Average: {avg_slide_time:.2f}s")
219
+ logger.info(f" Min: {min_slide_time:.2f}s")
220
+ logger.info(f" Max: {max_slide_time:.2f}s")
221
+ logger.info(f" Total: {total_slide_time:.2f}s")
222
+
223
+ # Calculate efficiency
224
+ overhead_time = batch_total_time - total_slide_time
225
+ logger.info("")
226
+ logger.info(f"Batch overhead: {overhead_time:.2f}s ({overhead_time/batch_total_time*100:.1f}%)")
227
+ logger.info(f"Slide processing: {total_slide_time:.2f}s ({total_slide_time/batch_total_time*100:.1f}%)")
228
+
229
+ logger.info("")
230
+ logger.info("βœ“ Batch processing optimization benefits:")
231
+ logger.info(" - Models loaded ONCE (not once per slide)")
232
+ logger.info(" - Reduced disk I/O for model loading")
233
+ logger.info(f" - Processed {num_slides} slides with shared model cache")
234
+ logger.info("=" * 80)
235
+
236
+ progress(1.0, desc=f"Batch analysis complete ({num_successful}/{num_slides} successful)")
237
 
238
  return all_slide_masks, all_aeon_results, all_paladin_results
src/mosaic/model_manager.py CHANGED
@@ -117,7 +117,9 @@ def load_all_models(
117
  FileNotFoundError: If model files are not found in data/ directory
118
  RuntimeError: If CUDA is requested but not available
119
  """
120
- logger.info("Loading models for batch processing...")
 
 
121
 
122
  # Detect GPU type
123
  device = torch.device("cpu")
@@ -127,15 +129,23 @@ def load_all_models(
127
  device = torch.device("cuda")
128
  gpu_name = torch.cuda.get_device_name(0)
129
  is_t4_gpu = "T4" in gpu_name
130
- logger.info(f"Detected GPU: {gpu_name}")
 
 
 
 
 
 
131
 
132
  # Auto-detect memory management strategy
133
  if aggressive_memory_mgmt is None:
134
  aggressive_memory_mgmt = is_t4_gpu
135
- logger.info(
136
- f"Auto-detected memory management: "
137
- f"{'aggressive (T4)' if is_t4_gpu else 'caching (high-memory GPU)'}"
138
- )
 
 
139
  elif use_gpu and not torch.cuda.is_available():
140
  logger.warning("GPU requested but CUDA not available, falling back to CPU")
141
  use_gpu = False
@@ -173,7 +183,11 @@ def load_all_models(
173
 
174
  with open(marker_classifier_path, "rb") as f:
175
  marker_classifier = pickle.load(f) # nosec
176
- logger.info("Marker Classifier loaded successfully")
 
 
 
 
177
 
178
  # Load Aeon model
179
  logger.info("Loading Aeon model...")
@@ -185,12 +199,23 @@ def load_all_models(
185
  aeon_model = pickle.load(f) # nosec
186
  aeon_model.to(device)
187
  aeon_model.eval()
188
- logger.info("Aeon model loaded successfully")
189
 
190
- # Log memory usage
 
 
 
 
 
191
  if use_gpu and torch.cuda.is_available():
192
  mem_allocated = torch.cuda.memory_allocated() / (1024**3)
193
- logger.info(f"GPU memory after loading core models: {mem_allocated:.2f} GB")
 
 
 
 
 
 
194
 
195
  # Create ModelCache
196
  cache = ModelCache(
@@ -203,7 +228,6 @@ def load_all_models(
203
  device=device,
204
  )
205
 
206
- logger.info("All core models loaded successfully")
207
  return cache
208
 
209
 
@@ -232,11 +256,15 @@ def load_paladin_model_for_inference(
232
 
233
  # Check cache first (only used in non-aggressive mode)
234
  if not cache.aggressive_memory_mgmt and model_key in cache.paladin_models:
235
- logger.debug(f"Using cached Paladin model: {model_path.name}")
236
  return cache.paladin_models[model_key]
237
 
238
  # Load model from disk
239
- logger.debug(f"Loading Paladin model: {model_path.name}")
 
 
 
 
240
  with open(model_path, "rb") as f:
241
  model = pickle.load(f) # nosec
242
 
@@ -246,6 +274,6 @@ def load_paladin_model_for_inference(
246
  # Cache if not in aggressive mode
247
  if not cache.aggressive_memory_mgmt:
248
  cache.paladin_models[model_key] = model
249
- logger.debug(f"Cached Paladin model: {model_path.name}")
250
 
251
  return model
 
117
  FileNotFoundError: If model files are not found in data/ directory
118
  RuntimeError: If CUDA is requested but not available
119
  """
120
+ logger.info("=" * 80)
121
+ logger.info("BATCH PROCESSING: Loading models (this happens ONCE per batch)")
122
+ logger.info("=" * 80)
123
 
124
  # Detect GPU type
125
  device = torch.device("cpu")
 
129
  device = torch.device("cuda")
130
  gpu_name = torch.cuda.get_device_name(0)
131
  is_t4_gpu = "T4" in gpu_name
132
+ gpu_memory_total = torch.cuda.get_device_properties(0).total_memory / (1024**3)
133
+ logger.info(f"GPU detected: {gpu_name}")
134
+ logger.info(f"GPU total memory: {gpu_memory_total:.2f} GB")
135
+
136
+ # Log initial GPU memory
137
+ mem_before = torch.cuda.memory_allocated() / (1024**3)
138
+ logger.info(f"GPU memory before loading models: {mem_before:.2f} GB")
139
 
140
  # Auto-detect memory management strategy
141
  if aggressive_memory_mgmt is None:
142
  aggressive_memory_mgmt = is_t4_gpu
143
+ strategy = "AGGRESSIVE (T4)" if is_t4_gpu else "CACHING (High-Memory GPU)"
144
+ logger.info(f"Memory management strategy: {strategy}")
145
+ if is_t4_gpu:
146
+ logger.info(" β†’ Paladin models will be loaded and freed per slide")
147
+ else:
148
+ logger.info(" β†’ Paladin models will be cached and reused across slides")
149
  elif use_gpu and not torch.cuda.is_available():
150
  logger.warning("GPU requested but CUDA not available, falling back to CPU")
151
  use_gpu = False
 
183
 
184
  with open(marker_classifier_path, "rb") as f:
185
  marker_classifier = pickle.load(f) # nosec
186
+ logger.info("βœ“ Marker Classifier loaded")
187
+
188
+ if use_gpu and torch.cuda.is_available():
189
+ mem = torch.cuda.memory_allocated() / (1024**3)
190
+ logger.info(f" GPU memory: {mem:.2f} GB")
191
 
192
  # Load Aeon model
193
  logger.info("Loading Aeon model...")
 
199
  aeon_model = pickle.load(f) # nosec
200
  aeon_model.to(device)
201
  aeon_model.eval()
202
+ logger.info("βœ“ Aeon model loaded")
203
 
204
+ if use_gpu and torch.cuda.is_available():
205
+ mem = torch.cuda.memory_allocated() / (1024**3)
206
+ logger.info(f" GPU memory: {mem:.2f} GB")
207
+
208
+ # Log final memory usage
209
+ logger.info("-" * 80)
210
  if use_gpu and torch.cuda.is_available():
211
  mem_allocated = torch.cuda.memory_allocated() / (1024**3)
212
+ logger.info(f"βœ“ All core models loaded to GPU")
213
+ logger.info(f" Total GPU memory used: {mem_allocated:.2f} GB")
214
+ logger.info(f" These models will be REUSED for all slides in this batch")
215
+ else:
216
+ logger.info("βœ“ All core models loaded to CPU")
217
+ logger.info(" These models will be REUSED for all slides in this batch")
218
+ logger.info("-" * 80)
219
 
220
  # Create ModelCache
221
  cache = ModelCache(
 
228
  device=device,
229
  )
230
 
 
231
  return cache
232
 
233
 
 
256
 
257
  # Check cache first (only used in non-aggressive mode)
258
  if not cache.aggressive_memory_mgmt and model_key in cache.paladin_models:
259
+ logger.info(f" βœ“ Using CACHED Paladin model: {model_path.name} (no disk I/O!)")
260
  return cache.paladin_models[model_key]
261
 
262
  # Load model from disk
263
+ if cache.aggressive_memory_mgmt:
264
+ logger.info(f" β†’ Loading Paladin model: {model_path.name} (will free after use)")
265
+ else:
266
+ logger.info(f" β†’ Loading Paladin model: {model_path.name} (will cache for reuse)")
267
+
268
  with open(model_path, "rb") as f:
269
  model = pickle.load(f) # nosec
270
 
 
274
  # Cache if not in aggressive mode
275
  if not cache.aggressive_memory_mgmt:
276
  cache.paladin_models[model_key] = model
277
+ logger.info(f" βœ“ Cached Paladin model for future reuse")
278
 
279
  return model