raylim committed on
Commit
c6bd865
·
unverified ·
1 Parent(s): aafc601

Make all GPU memory stats collection optional with try-except

Browse files

- Wrap all max_memory_allocated and reset_peak_memory_stats calls in try-except
- Prevents any CUDA-related errors from breaking execution
- Logs timing info even if memory stats fail
- Applied to CTransPath, Optimus, Aeon, and Paladin functions
- Ensures robustness on ZeroGPU and other GPU environments

Files changed (1) hide show
  1. src/mosaic/analysis.py +48 -36
src/mosaic/analysis.py CHANGED
@@ -117,16 +117,19 @@ def _extract_ctranspath_features(coords, slide_path, attrs, num_workers):
117
  ctranspath_features = np.concatenate(all_features, axis=0)
118
 
119
  end_time = pd.Timestamp.now()
120
- max_gpu_memory = (
121
- torch.cuda.max_memory_allocated() / (1024**3)
122
- if torch.cuda.is_available()
123
- else 0
124
- )
125
- logger.info(
126
- f"CTransPath extraction took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
127
- )
128
  if torch.cuda.is_available():
129
- torch.cuda.reset_peak_memory_stats()
 
 
 
 
 
 
 
 
 
130
 
131
  return ctranspath_features, coords
132
 
@@ -209,16 +212,19 @@ def _extract_optimus_features(filtered_coords, slide_path, attrs, num_workers):
209
  features = np.concatenate(all_features, axis=0)
210
 
211
  end_time = pd.Timestamp.now()
212
- max_gpu_memory = (
213
- torch.cuda.max_memory_allocated() / (1024**3)
214
- if torch.cuda.is_available()
215
- else 0
216
- )
217
- logger.info(
218
- f"Optimus extraction took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
219
- )
220
  if torch.cuda.is_available():
221
- torch.cuda.reset_peak_memory_stats()
 
 
 
 
 
 
 
 
 
222
 
223
  return features
224
 
@@ -253,16 +259,19 @@ def _run_aeon_inference(features, site_type, num_workers):
253
  use_cpu=False,
254
  )
255
  end_time = pd.Timestamp.now()
256
- max_gpu_memory = (
257
- torch.cuda.max_memory_allocated() / (1024**3)
258
- if torch.cuda.is_available()
259
- else 0
260
- )
261
- logger.info(
262
- f"Aeon inference took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
263
- )
264
  if torch.cuda.is_available():
265
- torch.cuda.reset_peak_memory_stats()
 
 
 
 
 
 
 
 
 
266
 
267
  return aeon_results
268
 
@@ -299,16 +308,19 @@ def _run_paladin_inference(features, aeon_results, site_type, num_workers):
299
  use_cpu=False,
300
  )
301
  end_time = pd.Timestamp.now()
302
- max_gpu_memory = (
303
- torch.cuda.max_memory_allocated() / (1024**3)
304
- if torch.cuda.is_available()
305
- else 0
306
- )
307
- logger.info(
308
- f"Paladin inference took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
309
- )
310
  if torch.cuda.is_available():
311
- torch.cuda.reset_peak_memory_stats()
 
 
 
 
 
 
 
 
 
312
 
313
  return paladin_results
314
 
 
117
  ctranspath_features = np.concatenate(all_features, axis=0)
118
 
119
  end_time = pd.Timestamp.now()
120
+
121
+ # Log memory stats if CUDA is available
 
 
 
 
 
 
122
  if torch.cuda.is_available():
123
+ try:
124
+ max_gpu_memory = torch.cuda.max_memory_allocated() / (1024**3)
125
+ logger.info(
126
+ f"CTransPath extraction took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
127
+ )
128
+ torch.cuda.reset_peak_memory_stats()
129
+ except Exception:
130
+ logger.info(f"CTransPath extraction took {end_time - start_time}")
131
+ else:
132
+ logger.info(f"CTransPath extraction took {end_time - start_time}")
133
 
134
  return ctranspath_features, coords
135
 
 
212
  features = np.concatenate(all_features, axis=0)
213
 
214
  end_time = pd.Timestamp.now()
215
+
216
+ # Log memory stats if CUDA is available
 
 
 
 
 
 
217
  if torch.cuda.is_available():
218
+ try:
219
+ max_gpu_memory = torch.cuda.max_memory_allocated() / (1024**3)
220
+ logger.info(
221
+ f"Optimus extraction took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
222
+ )
223
+ torch.cuda.reset_peak_memory_stats()
224
+ except Exception:
225
+ logger.info(f"Optimus extraction took {end_time - start_time}")
226
+ else:
227
+ logger.info(f"Optimus extraction took {end_time - start_time}")
228
 
229
  return features
230
 
 
259
  use_cpu=False,
260
  )
261
  end_time = pd.Timestamp.now()
262
+
263
+ # Log memory stats if CUDA is available
 
 
 
 
 
 
264
  if torch.cuda.is_available():
265
+ try:
266
+ max_gpu_memory = torch.cuda.max_memory_allocated() / (1024**3)
267
+ logger.info(
268
+ f"Aeon inference took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
269
+ )
270
+ torch.cuda.reset_peak_memory_stats()
271
+ except Exception:
272
+ logger.info(f"Aeon inference took {end_time - start_time}")
273
+ else:
274
+ logger.info(f"Aeon inference took {end_time - start_time}")
275
 
276
  return aeon_results
277
 
 
308
  use_cpu=False,
309
  )
310
  end_time = pd.Timestamp.now()
311
+
312
+ # Log memory stats if CUDA is available
 
 
 
 
 
 
313
  if torch.cuda.is_available():
314
+ try:
315
+ max_gpu_memory = torch.cuda.max_memory_allocated() / (1024**3)
316
+ logger.info(
317
+ f"Paladin inference took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
318
+ )
319
+ torch.cuda.reset_peak_memory_stats()
320
+ except Exception:
321
+ logger.info(f"Paladin inference took {end_time - start_time}")
322
+ else:
323
+ logger.info(f"Paladin inference took {end_time - start_time}")
324
 
325
  return paladin_results
326